diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
index 12f730738b8a..38c400ba1faf 100644
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -132,7 +132,7 @@ steps:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
env:
diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh
index 58fd435691f4..864eb470bb0a 100755
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -59,7 +59,7 @@ while true; do
fi
done
-echo "--- Pulling container"
+echo "--- Pulling container"
image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
docker pull "${image_name}"
@@ -78,17 +78,13 @@ HF_MOUNT="/root/.cache/huggingface"
commands=$@
echo "Commands:$commands"
-if [[ $commands == *"pytest -v -s basic_correctness/test_basic_correctness.py"* ]]; then
- commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s basic_correctness/test_basic_correctness.py"}
-fi
+commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"pytest -v -s basic_correctness/test_basic_correctness.py"}
if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
fi
-if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
- commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
-fi
+commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"pytest -v -s compile/test_basic_correctness.py"}
if [[ $commands == *"pytest -v -s lora"* ]]; then
commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
@@ -181,13 +177,13 @@ if [[ -z "$render_gid" ]]; then
exit 1
fi
-# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
+# check if the command contains the shard flag; if so, run all shards in parallel because the host has 8 GPUs.
if [[ $commands == *"--shard-id="* ]]; then
- # assign job count as the number of shards used
- commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
+ # assign job count as the number of shards used
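+  # rewrite "--num-shards=N" (allowing spaces around "=") to ${PARALLEL_JOB_COUNT} and collapse stray " \ " sequences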
+ commands=$(echo "$commands" | sed -E "s/--num-shards[[:blank:]]*=[[:blank:]]*[0-9]*/--num-shards=${PARALLEL_JOB_COUNT} /g" | sed 's/ \\ / /g')
for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
# assign shard-id for each shard
- commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
+ commands_gpu=$(echo "$commands" | sed -E "s/--shard-id[[:blank:]]*=[[:blank:]]*[0-9]*/--shard-id=${GPU} /g" | sed 's/ \\ / /g')
echo "Shard ${GPU} commands:$commands_gpu"
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
docker run \
diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
index 7927aef19e4e..7479c43977d7 100644
--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@@ -49,6 +49,7 @@ function cpu_tests() {
# Run kernel tests
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
+ pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
pytest -x -v -s tests/kernels/test_onednn.py"
# Run basic model test
@@ -76,7 +77,7 @@ function cpu_tests() {
# Run AWQ test
# docker exec cpu-test-"$NUMA_NODE" bash -c "
# set -e
- # VLLM_USE_V1=0 pytest -x -s -v \
+ # pytest -x -s -v \
# tests/quantization/test_ipex_quant.py"
# Run multi-lora tests
@@ -116,4 +117,4 @@ function cpu_tests() {
# All of CPU tests are expected to be finished less than 40 mins.
export -f cpu_tests
-timeout 2h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
+timeout 2.5h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
index 27ed67c4517e..d49f3e2f47cf 100644
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@@ -46,6 +46,6 @@ docker run \
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
pytest -v -s v1/structured_output
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py
- pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
+ pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
pytest -v -s v1/test_serial_utils.py
'
diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index bb5ef5d62463..2471b509a9ff 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -226,6 +226,27 @@ steps:
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
- popd
+- label: Distributed Tests (8 GPUs) # 4min
+ timeout_in_minutes: 10
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_8
+ # grade: Blocking
+ gpu: h100
+ num_gpus: 8
+ working_dir: "/vllm-workspace/tests"
+ source_file_dependencies:
+ - examples/offline_inference/torchrun_dp_example.py
+ - vllm/config/parallel.py
+ - vllm/distributed/
+ - vllm/v1/engine/llm_engine.py
+ - vllm/v1/executor/uniproc_executor.py
+ - vllm/v1/worker/gpu_worker.py
+ commands:
+ # https://github.com/NVIDIA/nccl/issues/1838
+ #- export NCCL_CUMEM_HOST_ENABLE=0
+  # test torchrun with tp=2, dp=4, and expert parallelism (EP) enabled
+ - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
+
- label: EPLB Algorithm Test # 5min
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
@@ -238,11 +259,11 @@ steps:
commands:
- pytest -v -s distributed/test_eplb_algo.py
-- label: EPLB Execution Test # 5min
+- label: EPLB Execution Test # 10min
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_4
# grade: Blocking
- timeout_in_minutes: 15
+ timeout_in_minutes: 20
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:
@@ -250,6 +271,7 @@ steps:
- tests/distributed/test_eplb_execute.py
commands:
- pytest -v -s distributed/test_eplb_execute.py
+ - pytest -v -s distributed/test_eplb_spec_decode.py
- label: Metrics, Tracing Test # 12min
timeout_in_minutes: 20
@@ -273,7 +295,7 @@ steps:
- label: Regression Test # 7min
timeout_in_minutes: 20
- mirror_hardwares: [amdexperimental, amdproduction]
+ mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
agent_pool: mi325_1
grade: Blocking
source_file_dependencies:
@@ -288,7 +310,7 @@ steps:
timeout_in_minutes: 40
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
- #grade: Blocking
+ # grade: Blocking
source_file_dependencies:
- vllm/
- tests/engine
@@ -337,6 +359,7 @@ steps:
- tests/v1
commands:
# split the test to avoid interference
+ - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- pytest -v -s -m 'not cpu_test' v1/core
- pytest -v -s v1/executor
- pytest -v -s v1/kv_offload
@@ -344,14 +367,29 @@ steps:
- pytest -v -s v1/logits_processors
- pytest -v -s v1/worker
- pytest -v -s v1/spec_decode
- - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_lmcache_integration.py
+ - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
- pytest -v -s -m 'not cpu_test' v1/metrics
- pytest -v -s v1/test_oracle.py
- pytest -v -s v1/test_request.py
+ - pytest -v -s v1/test_outputs.py
# Integration test for streaming correctness (requires special branch).
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
+# TODO: Add the "V1 Test attention (MI300)" test group
+
+- label: V1 Test attention (H100) # 10min
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ timeout_in_minutes: 30
+ gpu: h100
+ source_file_dependencies:
+ - vllm/v1/attention
+ - tests/v1/attention
+ commands:
+ - pytest -v -s v1/attention
+
- label: V1 Test others (CPU) # 5 mins
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
@@ -478,10 +516,11 @@ steps:
- tests/compile
commands:
- pytest -v -s compile/test_basic_correctness.py
+ - pytest -v -s compile/test_multimodal_compile.py
- pytest -v -s compile/piecewise/
-- label: PyTorch Fullgraph Test # 22min
- timeout_in_minutes: 35
+- label: PyTorch Fullgraph Test # 27min
+ timeout_in_minutes: 40
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
@@ -490,8 +529,23 @@ steps:
- vllm/
- tests/compile
commands:
- - pytest -v -s compile/test_full_graph.py
- - pytest -v -s compile/test_fusions_e2e.py
+ - pytest -v -s compile/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
+ # Limit to no custom ops to reduce running time
+ # Wrap with quotes to escape yaml and avoid starting -k string with a -
+ - "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and -quant_fp8'"
+
+- label: Cudagraph test
+ timeout_in_minutes: 20
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_1
+ source_file_dependencies:
+ - tests/v1/cudagraph
+ - vllm/v1/cudagraph_dispatcher.py
+ - vllm/config/compilation.py
+ - vllm/compilation
+ commands:
+ - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py
+ - pytest -v -s v1/cudagraph/test_cudagraph_mode.py
- label: Kernels Core Operation Test # 48min
timeout_in_minutes: 75
@@ -543,6 +597,8 @@ steps:
- tests/kernels/moe
- vllm/model_executor/layers/fused_moe/
- vllm/distributed/device_communicators/
+ - vllm/envs.py
+ - vllm/config
commands:
- pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 2
@@ -561,10 +617,13 @@ steps:
- label: Model Executor Test # 23min
timeout_in_minutes: 35
+ torch_nightly: true
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
source_file_dependencies:
+ - vllm/engine/arg_utils.py
+ - vllm/config/model.py
- vllm/model_executor
- tests/model_executor
- tests/entrypoints/openai/test_tensorizer_entrypoint.py
@@ -860,9 +919,10 @@ steps:
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
- label: Multi-Modal Accuracy Eval (Small Models) # 10min
+ timeout_in_minutes: 70
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
- timeout_in_minutes: 15
+ # grade: Blocking
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- vllm/multimodal/
@@ -933,6 +993,7 @@ steps:
- label: Transformers Nightly Models Test
mirror_hardwares: [amdexperimental]
agent_pool: mi325_1
+ # grade: Blocking
working_dir: "/vllm-workspace/"
optional: true
commands:
@@ -960,11 +1021,16 @@ steps:
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
+ - vllm/v1/attention/backends/mla/cutlass_mla.py
+ - vllm/v1/attention/backends/mla/flashinfer_mla.py
+ - vllm/platforms/cuda.py
+ - vllm/attention/selector.py
commands:
- nvidia-smi
- python3 examples/offline_inference/basic/chat.py
# Attention
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
+ - pytest -v -s tests/kernels/attention/test_attention_selector.py
- pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
- pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
- pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
@@ -1001,12 +1067,39 @@ steps:
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
# this runner has 2 GPUs available even though num_gpus=2 is not set
- pytest -v -s tests/compile/test_fusion_all_reduce.py
+ # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
+ # Wrap with quotes to escape yaml
+ - "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'"
+
+- label: Blackwell Fusion E2E Tests # 30 min
+ timeout_in_minutes: 40
+ working_dir: "/vllm-workspace/"
+ gpu: b200
+ optional: true
+ num_gpus: 2
+ source_file_dependencies:
+ - csrc/quantization/fp4/
+ - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+ - vllm/v1/attention/backends/flashinfer.py
+ - vllm/compilation/
+ # can affect pattern matching
+ - vllm/model_executor/layers/layernorm.py
+ - vllm/model_executor/layers/activation.py
+ - vllm/model_executor/layers/quantization/input_quant_fp8.py
+ - tests/compile/test_fusions_e2e.py
+ - tests/compile/test_full_graph.py
+ commands:
+ - nvidia-smi
+ # Run all e2e fusion tests
- pytest -v -s tests/compile/test_fusions_e2e.py
+ # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
+ - pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile
-- label: Blackwell GPT-OSS Eval
+- label: ROCm GPT-OSS Eval
timeout_in_minutes: 60
working_dir: "/vllm-workspace/"
- gpu: b200
+ agent_pool: mi325_1
+ mirror_hardwares: [amdproduction]
optional: true # run on nightlies
source_file_dependencies:
- tests/evals/gpt_oss
@@ -1015,7 +1108,7 @@ steps:
- vllm/v1/attention/backends/flashinfer.py
commands:
- uv pip install --system 'gpt-oss[eval]==0.0.5'
- - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
+ - VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
- label: Blackwell Quantized MoE Test
timeout_in_minutes: 60
@@ -1252,6 +1345,7 @@ steps:
- label: NixlConnector PD accuracy tests (Distributed) # 30min
mirror_hardwares: [amdexperimental]
agent_pool: mi325_4
+ # grade: Blocking
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
num_gpus: 4
@@ -1266,6 +1360,9 @@ steps:
##### A100 test #####
- label: Distributed Tests (A100) # optional
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_4
+ # grade: Blocking
gpu: a100
optional: true
num_gpus: 4
@@ -1280,6 +1377,9 @@ steps:
- pytest -v -s -x lora/test_mixtral.py
- label: LM Eval Large Models # optional
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_4
+ # grade: Blocking
gpu: a100
optional: true
num_gpus: 4
@@ -1291,8 +1391,27 @@ steps:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
+##### H100 test #####
+- label: LM Eval Large Models (H100) # optional
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_4
+ # grade: Blocking
+ gpu: h100
+ optional: true
+ num_gpus: 4
+ working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+ source_file_dependencies:
+ - csrc/
+ - vllm/model_executor/layers/quantization
+ commands:
+ - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
+ - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
+
##### H200 test #####
- label: Distributed Tests (H200) # optional
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_2
+ # grade: Blocking
gpu: h200
optional: true
working_dir: "/vllm-workspace/"
@@ -1304,6 +1423,7 @@ steps:
- pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
- pytest -v -s tests/distributed/test_context_parallel.py
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
+ - pytest -v -s tests/v1/distributed/test_dbo.py
##### B200 test #####
- label: Distributed Tests (B200) # optional
@@ -1314,6 +1434,7 @@ steps:
commands:
- pytest -v -s tests/distributed/test_context_parallel.py
- pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
+ - pytest -v -s tests/v1/distributed/test_dbo.py
##### RL Integration Tests #####
- label: Prime-RL Integration Test # 15min
@@ -1329,3 +1450,27 @@ steps:
- .buildkite/scripts/run-prime-rl-test.sh
commands:
- bash .buildkite/scripts/run-prime-rl-test.sh
+
+- label: DeepSeek V2-Lite Accuracy
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_4
+ # grade: Blocking
+ timeout_in_minutes: 60
+ gpu: h100
+ optional: true
+ num_gpus: 4
+ working_dir: "/vllm-workspace"
+ commands:
+ - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
+
+- label: Qwen3-30B-A3B-FP8-block Accuracy
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_4
+ # grade: Blocking
+ timeout_in_minutes: 60
+ gpu: h100
+ optional: true
+ num_gpus: 4
+ working_dir: "/vllm-workspace"
+ commands:
+ - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh 0.8 200 8020
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 3152cd6488f3..7396a3875c2b 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -25,6 +25,7 @@
# and $$BUILDKITE_PARALLEL_JOB_COUNT environment variables.
# working_dir(str): specify the place where the command should execute, default to /vllm-workspace/tests
# source_file_dependencies(list): the list of prefixes to opt-in the test for, if empty, the test will always run.
+# autorun_on_main (bool): defaults to false; if true, the test runs automatically when a commit is pushed to the main branch.
# When adding a test
# - If the test belongs to an existing group, add it there
@@ -56,7 +57,7 @@ steps:
- pytest -v -s -m 'not cpu_test' multimodal
- pytest -v -s utils_
-- label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins
+- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 4 mins
timeout_in_minutes: 10
source_file_dependencies:
- vllm/
@@ -65,6 +66,7 @@ steps:
- tests/multimodal
- tests/standalone_tests/lazy_imports.py
- tests/transformers_utils
+ - tests/config
no_gpu: true
commands:
- python3 standalone_tests/lazy_imports.py
@@ -72,6 +74,7 @@ steps:
- pytest -v -s test_outputs.py
- pytest -v -s -m 'cpu_test' multimodal
- pytest -v -s transformers_utils
+ - pytest -v -s config
- label: Python-only Installation Test # 10min
timeout_in_minutes: 20
@@ -329,6 +332,7 @@ steps:
- pytest -v -s -m 'not cpu_test' v1/metrics
- pytest -v -s v1/test_oracle.py
- pytest -v -s v1/test_request.py
+ - pytest -v -s v1/test_outputs.py
# Integration test for streaming correctness (requires special branch).
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
@@ -441,6 +445,8 @@ steps:
- vllm/
- tests/compile
commands:
+ - pytest -v -s compile/test_graph_partition.py
+ - pytest -v -s compile/test_config.py
- pytest -v -s compile/test_pass_manager.py
- pytest -v -s compile/test_fusion.py
- pytest -v -s compile/test_fusion_attn.py
@@ -450,6 +456,8 @@ steps:
- pytest -v -s compile/test_decorator.py
- pytest -v -s compile/test_noop_elimination.py
- pytest -v -s compile/test_aot_compile.py
+ - pytest -v -s compile/test_qk_norm_rope_fusion.py
+ - pytest -v -s compile/test_compile_ranges.py
- label: PyTorch Fullgraph Smoke Test # 15min
timeout_in_minutes: 30
@@ -471,10 +479,11 @@ steps:
- vllm/
- tests/compile
commands:
- - pytest -v -s compile/test_full_graph.py
- # Limit to no custom ops to reduce running time
+ # fp8 kv scales not supported on sm89, tested on Blackwell instead
+ - pytest -v -s compile/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
+ # Limit to no custom ops to reduce running time
# Wrap with quotes to escape yaml and avoid starting -k string with a -
- - "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and -quant_fp8'"
+ - "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
- label: Cudagraph test
timeout_in_minutes: 20
@@ -604,6 +613,7 @@ steps:
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
+ autorun_on_main: true
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
@@ -867,12 +877,12 @@ steps:
optional: true
commands:
- pip install --upgrade git+https://github.com/huggingface/transformers
- - pytest -v -s tests/models/test_initialization.py
+ - pytest -v -s tests/models/test_initialization.py -k 'not (Gemma3 or ModernBert or Qwen2_5_VL or Qwen2_5vl or Qwen2VL or TransformersMultiModalEmbeddingModel or TransformersMultiModalForSequenceClassification or Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or PaliGemma or RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)'
- pytest -v -s tests/models/test_transformers.py
- - pytest -v -s tests/models/multimodal/processing/
- - pytest -v -s tests/models/multimodal/test_mapping.py
+ # - pytest -v -s tests/models/multimodal/processing/
+ - pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)'
- python3 examples/offline_inference/basic/chat.py
- - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
+ # - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
# Whisper needs spawn method to avoid deadlock
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
@@ -890,11 +900,16 @@ steps:
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
+ - vllm/v1/attention/backends/mla/cutlass_mla.py
+ - vllm/v1/attention/backends/mla/flashinfer_mla.py
+ - vllm/platforms/cuda.py
+ - vllm/attention/selector.py
commands:
- nvidia-smi
- python3 examples/offline_inference/basic/chat.py
# Attention
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
+ - pytest -v -s tests/kernels/attention/test_attention_selector.py
- pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
- pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
- pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
@@ -912,7 +927,7 @@ steps:
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
- pytest -v -s tests/kernels/moe/test_flashinfer.py
-- label: Blackwell Fusion Tests # 30 min
+- label: Blackwell Fusion and Compile Tests # 30 min
timeout_in_minutes: 40
working_dir: "/vllm-workspace/"
gpu: b200
@@ -932,8 +947,10 @@ steps:
# this runner has 2 GPUs available even though num_gpus=2 is not set
- pytest -v -s tests/compile/test_fusion_all_reduce.py
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
- # Wrap with quotes to escape yaml
- - "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'"
+ # Wrap with quotes to escape yaml
+ - "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
+ # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
+ - pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile
- label: Blackwell Fusion E2E Tests # 30 min
timeout_in_minutes: 40
@@ -951,6 +968,7 @@ steps:
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/test_fusions_e2e.py
+ - tests/compile/test_full_graph.py
commands:
- nvidia-smi
# Run all e2e fusion tests
@@ -1250,7 +1268,8 @@ steps:
- pytest -v -s tests/compile/test_async_tp.py
- pytest -v -s tests/compile/test_sequence_parallelism.py
- pytest -v -s tests/compile/test_fusion_all_reduce.py
- - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
+ - "pytest -v -s tests/compile/test_fusions_e2e.py -k 'not Llama-4'"
+ - pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
- pytest -v -s tests/v1/distributed/test_dbo.py
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 23def076cf88..6e178bb690c5 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -3,8 +3,8 @@
# This lists cover the "core" components of vLLM that require careful review
/vllm/attention @LucasWilkinson
-/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
+/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @njhill
+/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn
/vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
/vllm/model_executor/layers/mamba @tdoublep
@@ -20,15 +20,15 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
# Any change to the VllmConfig changes can have a large user-facing impact,
# so spam a lot of people
-/vllm/config @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
-/vllm/config/cache.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345
+/vllm/config @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
+/vllm/config/cache.py @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345
# vLLM V1
/vllm/v1/attention @LucasWilkinson
/vllm/v1/attention/backends/mla @pavanimajety
/vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
/vllm/v1/attention/backends/triton_attn.py @tdoublep
-/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
+/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC
/vllm/v1/sample @22quinn @houseroad @njhill
/vllm/v1/spec_decode @benchislett @luccafong
/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
@@ -36,11 +36,11 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/vllm/v1/offloading @ApostaC
# Test ownership
-/.buildkite/lm-eval-harness @mgoin @simon-mo
+/.buildkite/lm-eval-harness @mgoin
/tests/distributed/test_multi_node_assignment.py @youkaichao
/tests/distributed/test_pipeline_parallel.py @youkaichao
/tests/distributed/test_same_node.py @youkaichao
-/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm @NickLucche
+/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @aarnphm @NickLucche
/tests/evals @mgoin
/tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256
/tests/models @DarkLight1337 @ywang96
@@ -49,7 +49,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/tests/test_inputs.py @DarkLight1337 @ywang96
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
/tests/v1/structured_output @mgoin @russellb @aarnphm
-/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
+/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC
/tests/weight_loading @mgoin @youkaichao @yewentao256
/tests/lora @jeejeelee
/tests/models/language/generation/test_hybrid.py @tdoublep
@@ -57,10 +57,20 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/tests/v1/kv_connector @ApostaC
/tests/v1/offloading @ApostaC
-# Transformers backend
+# Transformers modeling backend
/vllm/model_executor/models/transformers @hmellor
/tests/models/test_transformers.py @hmellor
+# Observability
+/vllm/config/observability.py @markmc
+/vllm/v1/metrics @markmc
+/tests/v1/metrics @markmc
+/vllm/tracing.py @markmc
+/tests/v1/tracing/test_tracing.py @markmc
+/vllm/config/kv_events.py @markmc
+/vllm/distributed/kv_events.py @markmc
+/tests/distributed/test_events.py @markmc
+
# Docs
/docs/mkdocs @hmellor
/docs/**/*.yml @hmellor
diff --git a/.github/mergify.yml b/.github/mergify.yml
index 18d4a2e83144..997a40e18e58 100644
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -151,6 +151,23 @@ pull_request_rules:
add:
- gpt-oss
+- name: label-nvidia
+ description: Automatically apply nvidia label
+ conditions:
+ - label != stale
+ - or:
+ - files~=cuda
+ - files~=cutlass
+ - files~=flashinfer
+ - files~=trtllm
+ - title~=(?i)NVIDIA
+ - title~=(?i)CUDA
+ - title~=(?i)CUTLASS
+ actions:
+ label:
+ add:
+ - nvidia
+
- name: label-rocm
description: Automatically apply rocm label
conditions:
diff --git a/.github/workflows/macos-smoke-test.yml b/.github/workflows/macos-smoke-test.yml
new file mode 100644
index 000000000000..42b05ecd5ac0
--- /dev/null
+++ b/.github/workflows/macos-smoke-test.yml
@@ -0,0 +1,81 @@
+name: macOS Apple Silicon Smoke Test
+
+on:
+ push:
+ branches:
+ - main
+ workflow_dispatch: # Manual trigger
+
+jobs:
+ macos-m1-smoke-test:
+ runs-on: macos-latest
+ timeout-minutes: 20
+
+ steps:
+ - uses: actions/checkout@v4
+
+ - uses: astral-sh/setup-uv@v7
+ with:
+ enable-cache: true
+ cache-dependency-glob: |
+ requirements/**/*.txt
+ pyproject.toml
+ python-version: '3.12'
+
+ - name: Create virtual environment
+ run: |
+ uv venv
+ echo "$GITHUB_WORKSPACE/.venv/bin" >> "$GITHUB_PATH"
+
+ - name: Install dependencies and build vLLM
+ run: |
+ uv pip install -r requirements/cpu.txt --index-strategy unsafe-best-match
+ uv pip install -e .
+ env:
+ CMAKE_BUILD_PARALLEL_LEVEL: 4
+
+ - name: Verify installation
+ run: |
+ python -c "import vllm; print(f'vLLM version: {vllm.__version__}')"
+ python -c "import torch; print(f'PyTorch: {torch.__version__}')"
+
+ - name: Smoke test vllm serve
+ timeout-minutes: 10
+ run: |
+ # Start server in background
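+          # Dummy weight loading (--load-format=dummy) and --enforce-eager keep the smoke test fast; no real model weights are downloaded.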
+ vllm serve Qwen/Qwen3-0.6B \
+ --max-model-len=2048 \
+ --load-format=dummy \
+ --enforce-eager \
+ --port 8000 &
+
+ SERVER_PID=$!
+
+ # Wait for server to start
+ for i in {1..30}; do
+ if curl -s http://localhost:8000/health > /dev/null; then
+ echo "Server started successfully"
+ break
+ fi
+ if [ "$i" -eq 30 ]; then
+ echo "Server failed to start"
+ kill "$SERVER_PID"
+ exit 1
+ fi
+ sleep 2
+ done
+
+ # Test health endpoint
+ curl -f http://localhost:8000/health
+
+ # Test completion
+ curl -f http://localhost:8000/v1/completions \
+ -H "Content-Type: application/json" \
+ -d '{
+ "model": "Qwen/Qwen3-0.6B",
+ "prompt": "Hello",
+ "max_tokens": 5
+ }'
+
+ # Cleanup
+ kill "$SERVER_PID"
diff --git a/.markdownlint.yaml b/.markdownlint.yaml
index cd9df57cd980..937487f47364 100644
--- a/.markdownlint.yaml
+++ b/.markdownlint.yaml
@@ -3,10 +3,9 @@ MD007:
MD013: false
MD024:
siblings_only: true
+MD031:
+ list_items: false
MD033: false
-MD045: false
MD046: false
-MD051: false
MD052: false
-MD053: false
MD059: false
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0e9fa63b178e..3a37040edbf1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -39,6 +39,13 @@ set(PYTHON_SUPPORTED_VERSIONS "3.10" "3.11" "3.12" "3.13")
# Supported AMD GPU architectures.
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
+# ROCm installation prefix. Default to /opt/rocm but allow override via
+# -DROCM_PATH=/your/rocm/path when invoking cmake.
+if(NOT DEFINED ROCM_PATH)
+ set(ROCM_PATH "/opt/rocm" CACHE PATH "ROCm installation prefix")
+else()
+ set(ROCM_PATH ${ROCM_PATH} CACHE PATH "ROCm installation prefix" FORCE)
+endif()
#
# Supported/expected torch versions for CUDA/ROCm.
#
@@ -237,10 +244,27 @@ set_gencode_flags_for_srcs(
SRCS "${VLLM_CUMEM_EXT_SRC}"
CUDA_ARCHS "${CUDA_ARCHS}")
-if(VLLM_GPU_LANG STREQUAL "CUDA")
+if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
message(STATUS "Enabling cumem allocator extension.")
- # link against cuda driver library
- list(APPEND CUMEM_LIBS CUDA::cuda_driver)
+ if(VLLM_GPU_LANG STREQUAL "CUDA")
+ # link against cuda driver library
+ list(APPEND CUMEM_LIBS CUDA::cuda_driver)
+ else()
+ # link against rocm driver library. Prefer an absolute path to
+ # libamdhip64.so inside ${ROCM_PATH}/lib if available, otherwise fall
+ # back to linking by name "amdhip64".
+ find_library(AMDHIP64_LIB
+ NAMES amdhip64 libamdhip64.so
+ PATHS ${ROCM_PATH}/lib
+ NO_DEFAULT_PATH)
+ if(AMDHIP64_LIB)
+ message(STATUS "Found libamdhip64 at ${AMDHIP64_LIB}")
+ list(APPEND CUMEM_LIBS ${AMDHIP64_LIB})
+ else()
+ message(WARNING "libamdhip64 not found in ${ROCM_PATH}/lib; falling back to linking 'amdhip64' by name")
+ list(APPEND CUMEM_LIBS amdhip64)
+ endif()
+ endif()
define_extension_target(
cumem_allocator
DESTINATION vllm
@@ -265,6 +289,7 @@ set(VLLM_EXT_SRC
"csrc/pos_encoding_kernels.cu"
"csrc/activation_kernels.cu"
"csrc/layernorm_kernels.cu"
+ "csrc/fused_qknorm_rope_kernel.cu"
"csrc/layernorm_quant_kernels.cu"
"csrc/sampler.cu"
"csrc/cuda_view.cu"
@@ -330,7 +355,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# Keep building Marlin for 9.0 as there are some group sizes and shapes that
# are not supported by Machete yet.
# 9.0 for latest bf16 atomicAdd PTX
- cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
+ cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
if (MARLIN_ARCHS)
#
@@ -836,7 +861,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
# Hadacore kernels
- cuda_archs_loose_intersection(HADACORE_ARCHS "8.0;8.9;9.0" "${CUDA_ARCHS}")
+ cuda_archs_loose_intersection(HADACORE_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
if(HADACORE_ARCHS)
set(SRCS "csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu")
set_gencode_flags_for_srcs(
@@ -914,7 +939,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
# 9.0 for latest bf16 atomicAdd PTX
- cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
+ cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
if (MARLIN_MOE_ARCHS)
#
diff --git a/README.md b/README.md
index b5e230e4b9b0..033e1035d891 100644
--- a/README.md
+++ b/README.md
@@ -21,6 +21,7 @@ Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundatio
*Latest News* 🔥
+- [2025/11] We hosted [the first vLLM Europe Meetup in Zurich](https://luma.com/0gls27kb) focused on quantization, distributed inference, and reinforcement learning at scale with speakers from Mistral, IBM, and Red Hat. Please find the meetup slides [here](https://docs.google.com/presentation/d/1UC9PTLCHYXQpOmJDSFg6Sljra3iVXzc09DeEI7dnxMc/edit?usp=sharing) and the recording [here](https://www.youtube.com/watch?v=6m6ZE6yVEDI).
- [2025/11] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/xSrYXjNgr1HbCP4ExYNG1w) focusing on distributed inference and diverse accelerator support with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1nQJ8ZkLSjKxvu36sSHaceVXtttbLvvu-?usp=drive_link).
- [2025/10] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg) focused on hands-on vLLM inference optimization! Please find the meetup slides [here](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6).
- [2025/09] We hosted [vLLM Toronto Meetup](https://luma.com/e80e0ymm) focused on tackling inference at scale and speculative decoding with speakers from NVIDIA and Red Hat! Please find the meetup slides [here](https://docs.google.com/presentation/d/1IYJYmJcu9fLpID5N5RbW_vO0XLo0CGOR14IXOjB61V8/edit?usp=sharing).
diff --git a/benchmarks/benchmark_batch_invariance.py b/benchmarks/benchmark_batch_invariance.py
new file mode 100755
index 000000000000..b5c16c42de46
--- /dev/null
+++ b/benchmarks/benchmark_batch_invariance.py
@@ -0,0 +1,380 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Benchmark to measure the performance overhead of VLLM_BATCH_INVARIANT mode.
+
+This benchmark runs the same workload twice:
+1. With VLLM_BATCH_INVARIANT=0 (baseline)
+2. With VLLM_BATCH_INVARIANT=1 (batch invariant mode)
+
+And reports the timing and throughput metrics for comparison.
+
+Environment variables:
+ VLLM_BENCH_MODEL: Model to benchmark (default: "Qwen/Qwen3-1.7B")
+ VLLM_BENCH_TP_SIZE: Tensor parallel size (default: 1, use 8 for deepseek)
+ VLLM_BENCH_BATCH_SIZE: Max batch size (default: 128)
+ VLLM_BENCH_NUM_TRIALS: Number of trials to run (default: 5)
+ VLLM_BENCH_MIN_PROMPT: Min prompt length in words (default: 1024)
+ VLLM_BENCH_MAX_PROMPT: Max prompt length in words (default: 2048)
+ VLLM_BENCH_MAX_TOKENS: Max tokens to generate (default: 128)
+ VLLM_BENCH_TEMPERATURE: Temperature for sampling (default: 0.0)
+ VLLM_BENCH_GPU_MEMORY_UTILIZATION: GPU memory utilization (default: 0.4)
+ VLLM_BENCH_MAX_MODEL_LEN: Max model length (default: 5120)
+ VLLM_BENCH_BACKEND: Attention backend (default: FLASH_ATTN)
+
+Example usage:
+ # Benchmark qwen3 (default)
+ python benchmarks/benchmark_batch_invariance.py
+
+ # Benchmark deepseek with 8 GPUs
+ VLLM_BENCH_MODEL="deepseek-ai/DeepSeek-V3" VLLM_BENCH_TP_SIZE=8 \\
+ python benchmarks/benchmark_batch_invariance.py
+
+ # Quick test with fewer trials
+ VLLM_BENCH_NUM_TRIALS=2 VLLM_BENCH_BATCH_SIZE=32 \\
+ python benchmarks/benchmark_batch_invariance.py
+"""
+
+import contextlib
+import os
+import random
+import time
+
+from vllm import LLM, SamplingParams
+from vllm.platforms import current_platform
+
+
+def _random_prompt(min_words: int = 1024, max_words: int = 1024 * 2) -> str:
+ """Generate a random prompt for benchmarking."""
+ prompt_templates = [
+ "Question: What is the capital of France?\nAnswer: The capital of France is",
+ "Q: How does photosynthesis work?\nA: Photosynthesis is the process by which",
+ "User: Can you explain quantum mechanics?\nAssistant: Quantum mechanics is",
+ "Once upon a time in a distant galaxy, there lived",
+ "The old man walked slowly down the street, remembering",
+ "In the year 2157, humanity finally discovered",
+ "To implement a binary search tree in Python, first we need to",
+ "The algorithm works by iterating through the array and",
+ "Here's how to optimize database queries using indexing:",
+ "The Renaissance was a period in European history that",
+ "Climate change is caused by several factors including",
+ "The human brain contains approximately 86 billion neurons which",
+ "I've been thinking about getting a new laptop because",
+ "Yesterday I went to the store and bought",
+ "My favorite thing about summer is definitely",
+ ]
+
+ base_prompt = random.choice(prompt_templates)
+
+ if max_words < min_words:
+ max_words = min_words
+ target_words = random.randint(min_words, max_words)
+
+ if target_words > 50:
+ padding_text = (
+ " This is an interesting topic that deserves more explanation. "
+ * (target_words // 50)
+ )
+ base_prompt = base_prompt + padding_text
+
+ return base_prompt
+
+
+def run_benchmark_with_batch_invariant(
+ model: str,
+ tp_size: int,
+ max_batch_size: int,
+ num_trials: int,
+ min_prompt: int,
+ max_prompt: int,
+ max_tokens: int,
+ temperature: float,
+ gpu_mem_util: float,
+ max_model_len: int,
+ backend: str,
+ batch_invariant: bool,
+ seed: int = 12345,
+) -> dict:
+ """
+ Run the benchmark with the specified configuration.
+
+ Returns a dict with timing and throughput metrics.
+ """
+ random.seed(seed)
+
+ # Set environment variables
+ os.environ["VLLM_ATTENTION_BACKEND"] = backend
+ if batch_invariant:
+ os.environ["VLLM_BATCH_INVARIANT"] = "1"
+ else:
+ os.environ["VLLM_BATCH_INVARIANT"] = "0"
+
+ print(f"\n{'=' * 80}")
+ print(f"BENCHMARK: VLLM_BATCH_INVARIANT={int(batch_invariant)}")
+ print(f" Model: {model}")
+ print(f" TP Size: {tp_size}")
+ print(f" Backend: {backend}")
+ print(f" Max Batch Size: {max_batch_size}")
+ print(f" Trials: {num_trials}")
+ print(f" Max Tokens: {max_tokens}")
+ print(f"{'=' * 80}\n")
+
+ sampling = SamplingParams(
+ temperature=temperature,
+ top_p=0.95,
+ max_tokens=max_tokens,
+ seed=20240919,
+ )
+
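+    # Short fixed "needle" prompt inserted at a random position in every trial batch (see the trial loop below).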
+ needle_prompt = "There once was a "
+
+ llm = None
+ try:
+ # Create LLM engine
+ start_init = time.perf_counter()
+ llm = LLM(
+ model=model,
+ max_num_seqs=max_batch_size,
+ gpu_memory_utilization=gpu_mem_util,
+ max_model_len=max_model_len,
+ dtype="bfloat16",
+ tensor_parallel_size=tp_size,
+ enable_prefix_caching=False,
+ )
+ init_time = time.perf_counter() - start_init
+ print(f"Engine initialization time: {init_time:.2f}s\n")
+
+ # Generate baseline
+ print("Generating baseline (warmup)...")
+ baseline_out = llm.generate([needle_prompt], sampling)
+ assert len(baseline_out) == 1
+ baseline_text = baseline_out[0].outputs[0].text
+ print(f"Baseline output: '{baseline_text[:50]}...'\n")
+
+ # Run trials and measure timing
+ trial_times: list[float] = []
+ total_tokens = 0
+ total_prompts = 0
+
+ for trial in range(num_trials):
+ # Create a batch
+ prompts: list[str] = []
+ batch_size = random.randint(max_batch_size // 2, max_batch_size)
+ needle_pos = random.randint(0, batch_size - 1)
+ for i in range(batch_size):
+ if i == needle_pos:
+ prompts.append(needle_prompt)
+ else:
+ prompts.append(_random_prompt(min_prompt, max_prompt))
+
+ # Measure time for this trial
+ start_time = time.perf_counter()
+ outputs = llm.generate(prompts, sampling)
+ trial_time = time.perf_counter() - start_time
+
+ trial_times.append(trial_time)
+ total_prompts += len(prompts)
+
+ # Count tokens
+ for output in outputs:
+ if output.outputs:
+ total_tokens += len(output.outputs[0].token_ids)
+
+ print(
+ f"Trial {trial + 1}/{num_trials}: "
+ f"batch_size={batch_size}, "
+ f"time={trial_time:.2f}s"
+ )
+
+ # Verify needle output still matches
+ needle_output = outputs[needle_pos]
+ assert needle_output.prompt == needle_prompt
+
+ # Compute statistics
+ avg_time = sum(trial_times) / len(trial_times)
+ min_time = min(trial_times)
+ max_time = max(trial_times)
+ throughput = total_tokens / sum(trial_times)
+ prompts_per_sec = total_prompts / sum(trial_times)
+
+ print(f"\n{'=' * 80}")
+ print("RESULTS:")
+ print(f" Average time per trial: {avg_time:.2f}s")
+ print(f" Min time: {min_time:.2f}s")
+ print(f" Max time: {max_time:.2f}s")
+ print(f" Total tokens generated: {total_tokens}")
+ print(f" Total prompts processed: {total_prompts}")
+ print(f" Throughput: {throughput:.2f} tokens/s")
+ print(f" Prompts/s: {prompts_per_sec:.2f}")
+ print(f"{'=' * 80}\n")
+
+ return {
+ "init_time": init_time,
+ "avg_time": avg_time,
+ "min_time": min_time,
+ "max_time": max_time,
+ "total_tokens": total_tokens,
+ "total_prompts": total_prompts,
+ "throughput": throughput,
+ "prompts_per_sec": prompts_per_sec,
+ "trial_times": trial_times,
+ }
+
+ finally:
+ # Cleanup
+ if llm is not None:
+ with contextlib.suppress(Exception):
+ llm.shutdown()
+
+
+def main():
+ # Check platform support
+ if not (current_platform.is_cuda() and current_platform.has_device_capability(90)):
+ print("ERROR: Requires CUDA and >= Hopper (SM90)")
+ print(f"Current platform: {current_platform.device_type}")
+ if current_platform.is_cuda():
+ print(f"Device capability: {current_platform.get_device_capability()}")
+ return 1
+
+ # Read configuration from environment
+ model = os.getenv("VLLM_BENCH_MODEL", "Qwen/Qwen3-1.7B")
+ tp_size = int(os.getenv("VLLM_BENCH_TP_SIZE", "1"))
+ max_batch_size = int(os.getenv("VLLM_BENCH_BATCH_SIZE", "128"))
+ num_trials = int(os.getenv("VLLM_BENCH_NUM_TRIALS", "5"))
+ min_prompt = int(os.getenv("VLLM_BENCH_MIN_PROMPT", "1024"))
+ max_prompt = int(os.getenv("VLLM_BENCH_MAX_PROMPT", "2048"))
+ max_tokens = int(os.getenv("VLLM_BENCH_MAX_TOKENS", "128"))
+ temperature = float(os.getenv("VLLM_BENCH_TEMPERATURE", "0.0"))
+ gpu_mem_util = float(os.getenv("VLLM_BENCH_GPU_MEMORY_UTILIZATION", "0.4"))
+ max_model_len = int(os.getenv("VLLM_BENCH_MAX_MODEL_LEN", "5120"))
+ backend = os.getenv("VLLM_BENCH_BACKEND", "FLASH_ATTN")
+
+ print("\n" + "=" * 80)
+ print("VLLM BATCH INVARIANCE BENCHMARK")
+ print("=" * 80)
+ print("\nConfiguration:")
+ print(f" Model: {model}")
+ print(f" Tensor Parallel Size: {tp_size}")
+ print(f" Attention Backend: {backend}")
+ print(f" Max Batch Size: {max_batch_size}")
+ print(f" Number of Trials: {num_trials}")
+ print(f" Prompt Length Range: {min_prompt}-{max_prompt} words")
+ print(f" Max Tokens to Generate: {max_tokens}")
+ print(f" Temperature: {temperature}")
+ print(f" GPU Memory Utilization: {gpu_mem_util}")
+ print(f" Max Model Length: {max_model_len}")
+ print("=" * 80)
+
+ # Run benchmark WITHOUT batch invariance (baseline)
+ print("\n" + "=" * 80)
+ print("PHASE 1: Running WITHOUT batch invariance (baseline)")
+ print("=" * 80)
+ baseline_results = run_benchmark_with_batch_invariant(
+ model=model,
+ tp_size=tp_size,
+ max_batch_size=max_batch_size,
+ num_trials=num_trials,
+ min_prompt=min_prompt,
+ max_prompt=max_prompt,
+ max_tokens=max_tokens,
+ temperature=temperature,
+ gpu_mem_util=gpu_mem_util,
+ max_model_len=max_model_len,
+ backend=backend,
+ batch_invariant=False,
+ )
+
+ # Run benchmark WITH batch invariance
+ print("\n" + "=" * 80)
+ print("PHASE 2: Running WITH batch invariance")
+ print("=" * 80)
+ batch_inv_results = run_benchmark_with_batch_invariant(
+ model=model,
+ tp_size=tp_size,
+ max_batch_size=max_batch_size,
+ num_trials=num_trials,
+ min_prompt=min_prompt,
+ max_prompt=max_prompt,
+ max_tokens=max_tokens,
+ temperature=temperature,
+ gpu_mem_util=gpu_mem_util,
+ max_model_len=max_model_len,
+ backend=backend,
+ batch_invariant=True,
+ )
+
+ # Compare results
+ print("\n" + "=" * 80)
+ print("COMPARISON: Batch Invariance vs Baseline")
+ print("=" * 80)
+
+ init_overhead_pct = (
+ (batch_inv_results["init_time"] - baseline_results["init_time"])
+ / baseline_results["init_time"]
+ * 100
+ )
+ time_overhead_pct = (
+ (batch_inv_results["avg_time"] - baseline_results["avg_time"])
+ / baseline_results["avg_time"]
+ * 100
+ )
+ throughput_change_pct = (
+ (batch_inv_results["throughput"] - baseline_results["throughput"])
+ / baseline_results["throughput"]
+ * 100
+ )
+
+ print("\nInitialization Time:")
+ print(f" Baseline: {baseline_results['init_time']:.2f}s")
+ print(f" Batch Invariant: {batch_inv_results['init_time']:.2f}s")
+ print(f" Overhead: {init_overhead_pct:+.2f}%")
+
+ print("\nAverage Trial Time:")
+ print(f" Baseline: {baseline_results['avg_time']:.2f}s")
+ print(f" Batch Invariant: {batch_inv_results['avg_time']:.2f}s")
+ print(f" Overhead: {time_overhead_pct:+.2f}%")
+
+ print("\nThroughput (tokens/s):")
+ print(f" Baseline: {baseline_results['throughput']:.2f}")
+ print(f" Batch Invariant: {batch_inv_results['throughput']:.2f}")
+ print(f" Change: {throughput_change_pct:+.2f}%")
+
+ print("\nPrompts/s:")
+ print(f" Baseline: {baseline_results['prompts_per_sec']:.2f}")
+ print(f" Batch Invariant: {batch_inv_results['prompts_per_sec']:.2f}")
+
+ print("\n" + "=" * 80)
+ print("SUMMARY")
+ print("=" * 80)
+ if time_overhead_pct > 0:
+ print(
+ f"Batch invariance mode adds approximately {time_overhead_pct:.1f}% "
+ "overhead"
+ )
+ else:
+ print(
+ f"Batch invariance mode is approximately {-time_overhead_pct:.1f}% "
+ "faster (unexpected!)"
+ )
+
+ if abs(throughput_change_pct) < 1.0:
+ print("Throughput difference is negligible (< 1%)")
+ elif throughput_change_pct < 0:
+ print(
+ f"Throughput decreased by {-throughput_change_pct:.1f}% "
+ "with batch invariance"
+ )
+ else:
+ print(
+ f"Throughput increased by {throughput_change_pct:.1f}% "
+ "with batch invariance (unexpected!)"
+ )
+
+ print("=" * 80 + "\n")
+
+ return 0
+
+
+if __name__ == "__main__":
+ exit(main())
diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py
index 146c268a6b7f..28fc383a318d 100644
--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@@ -69,7 +69,7 @@ def sample_tokens(tokenizer: PreTrainedTokenizerBase, length: int) -> list[int]:
# Remove the special tokens.
return random.choices(
- [v for k, v in vocab.items() if k not in all_special_ids],
+ [v for v in vocab.values() if v not in all_special_ids],
k=length,
)
diff --git a/benchmarks/kernels/bench_block_fp8_gemm.py b/benchmarks/kernels/bench_block_fp8_gemm.py
index f1e504499eaf..11e3ac7f0c1f 100644
--- a/benchmarks/kernels/bench_block_fp8_gemm.py
+++ b/benchmarks/kernels/bench_block_fp8_gemm.py
@@ -1,10 +1,18 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import os
+
+# Disable DeepGEMM for this benchmark to use CUTLASS
+os.environ["VLLM_USE_DEEP_GEMM"] = "0"
+
import torch
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
- apply_w8a8_block_fp8_linear,
+ W8A8BlockFp8LinearOp,
+)
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+ GroupShape,
)
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
CUTLASS_BLOCK_FP8_SUPPORTED,
@@ -39,13 +47,14 @@ def build_w8a8_block_fp8_runner(M, N, K, block_size, device, use_cutlass):
fp8_info = torch.finfo(torch.float8_e4m3fn)
fp8_max, fp8_min = fp8_info.max, fp8_info.min
- # Create random FP8 tensors
+ # Create random input tensor (bfloat16, will be quantized by W8A8BlockFp8LinearOp)
A_ref = (torch.rand(M, K, dtype=torch.bfloat16, device=device) - 0.5) * 2 * fp8_max
+ # Create quantized weight tensor
B_ref = (torch.rand(N, K, dtype=torch.bfloat16, device=device) - 0.5) * 2 * fp8_max
B = B_ref.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
- # Create scales
+ # Create weight scales
block_n, block_k = block_size[0], block_size[1]
n_tiles = (N + block_n - 1) // block_n
k_tiles = (K + block_k - 1) // block_k
@@ -55,19 +64,25 @@ def build_w8a8_block_fp8_runner(M, N, K, block_size, device, use_cutlass):
* factor_for_scale
)
- # SM90 CUTLASS requires row-major format for scales
- if use_cutlass and current_platform.is_device_capability(90):
- Bs = Bs.T.contiguous()
+ # Create W8A8BlockFp8LinearOp instance
+ weight_group_shape = GroupShape(block_n, block_k)
+ act_quant_group_shape = GroupShape(1, block_k) # Per-token, per-group quantization
+
+ linear_op = W8A8BlockFp8LinearOp(
+ weight_group_shape=weight_group_shape,
+ act_quant_group_shape=act_quant_group_shape,
+ cutlass_block_fp8_supported=use_cutlass,
+ use_aiter_and_is_supported=False,
+ )
def run():
- if use_cutlass:
- return apply_w8a8_block_fp8_linear(
- A_ref, B, block_size, Bs, cutlass_block_fp8_supported=True
- )
- else:
- return apply_w8a8_block_fp8_linear(
- A_ref, B, block_size, Bs, cutlass_block_fp8_supported=False
- )
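+        # apply() quantizes the bfloat16 input on the fly and runs the block-FP8 GEMM
+        # (CUTLASS when enabled via cutlass_block_fp8_supported, Triton kernel otherwise).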
+ return linear_op.apply(
+ input=A_ref,
+ weight=B,
+ weight_scale=Bs,
+ input_scale=None,
+ bias=None,
+ )
return run
diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py
index 29ef6409bb16..074b7a440b61 100644
--- a/benchmarks/kernels/benchmark_rope.py
+++ b/benchmarks/kernels/benchmark_rope.py
@@ -1,97 +1,76 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from itertools import accumulate
+import itertools
-import nvtx
import torch
-from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding, get_rope
-from vllm.platforms import current_platform
+from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.triton_utils import triton
from vllm.utils.argparse_utils import FlexibleArgumentParser
+batch_size_range = [2**i for i in range(0, 8, 2)]
+seq_len_range = [2**i for i in range(6, 10, 1)]
+num_heads_range = [32, 48]
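+# Sweep the Cartesian product of (batch_size, seq_len, num_heads) configurations.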
+configs = list(itertools.product(batch_size_range, seq_len_range, num_heads_range))
-def benchmark_rope_kernels_multi_lora(
- is_neox_style: bool,
- batch_size: int,
- seq_len: int,
- num_heads: int,
- head_size: int,
- rotary_dim: int | None,
- dtype: torch.dtype,
- seed: int,
- device: str,
- max_position: int = 8192,
- base: float = 10000,
-) -> None:
- current_platform.seed_everything(seed)
- torch.set_default_device(device)
- if rotary_dim is None:
- rotary_dim = head_size
- # silulating serving 4 LoRAs
- scaling_factors = [1, 2, 4, 8]
- # batched RoPE can take multiple scaling factors
- batched_rope = get_rope(
- head_size,
- rotary_dim,
- max_position,
- base,
- is_neox_style,
- {"rope_type": "linear", "factor": tuple(scaling_factors)},
+
+def get_benchmark(head_size, rotary_dim, is_neox_style, device):
+ @triton.testing.perf_report(
+ triton.testing.Benchmark(
+ x_names=["batch_size", "seq_len", "num_heads"],
+ x_vals=[list(_) for _ in configs],
+ line_arg="provider",
+ line_vals=["torch", "flashinfer", "vllm"],
+ line_names=["PyTorch", "FlashInfer", "vLLM"],
+ styles=[("blue", "-"), ("green", "-"), ("red", "-")],
+ ylabel="us",
+ plot_name=f"rope-perf{'-neox-style' if is_neox_style else ''}",
+ args={},
+ )
)
- # non-batched RoPE takes only one scaling factor, we create multiple
- # instances to simulate the same behavior
- non_batched_ropes: list[RotaryEmbedding] = []
- for scaling_factor in scaling_factors:
- non_batched_ropes.append(
- get_rope(
- head_size,
- rotary_dim,
- max_position,
- base,
- is_neox_style,
- {"rope_type": "linear", "factor": (scaling_factor,)},
- )
+ def benchmark(batch_size, seq_len, num_heads, provider):
+ dtype = torch.bfloat16
+ max_position = 8192
+ base = 10000
+ rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style)
+ rope = rope.to(dtype=dtype, device=device)
+ cos_sin_cache = rope.cos_sin_cache.to(dtype=torch.float, device=device)
+
+ positions = torch.randint(0, max_position, (batch_size, seq_len), device=device)
+ query = torch.randn(
+ (batch_size, seq_len, num_heads * head_size), dtype=dtype, device=device
)
+ key = torch.randn_like(query)
- positions = torch.randint(0, max_position, (batch_size, seq_len))
- query = torch.randn(batch_size, seq_len, num_heads * head_size, dtype=dtype)
- key = torch.randn_like(query)
+ quantiles = [0.5, 0.2, 0.8]
- # create query offsets for batched RoPE, we concat multiple kv cache
- # together and each query needs to find the right kv cache of its type
- offset_map = torch.tensor(
- list(
- accumulate(
- [0]
- + [
- max_position * scaling_factor * 2
- for scaling_factor in scaling_factors[:-1]
- ]
+ if provider == "torch":
+ ms, min_ms, max_ms = triton.testing.do_bench(
+ lambda: rope.forward_native(positions, query.clone(), key.clone()),
+ quantiles=quantiles,
)
- )
- )
- query_types = torch.randint(
- 0, len(scaling_factors), (batch_size, seq_len), device=device
- )
- # map query types to offsets
- query_offsets = offset_map[query_types]
- # the kernel takes flattened offsets
- flatten_offsets = query_offsets.flatten()
+ elif provider == "flashinfer":
+ ms, min_ms, max_ms = triton.testing.do_bench(
+ lambda: torch.ops.vllm.flashinfer_rotary_embedding(
+ positions,
+ query.clone(),
+ key.clone(),
+ head_size,
+ cos_sin_cache,
+ is_neox_style,
+ ),
+ quantiles=quantiles,
+ )
+ else:
+ ms, min_ms, max_ms = triton.testing.do_bench(
+ lambda: rope.forward_cuda(positions, query.clone(), key.clone()),
+ quantiles=quantiles,
+ )
+
+ return 1000 * ms, 1000 * max_ms, 1000 * min_ms
- # batched queries of the same type together for non-batched RoPE
- queries = [query[query_types == i] for i in range(len(scaling_factors))]
- keys = [key[query_types == i] for i in range(len(scaling_factors))]
- packed_qkr = zip(queries, keys, non_batched_ropes)
- # synchronize before start timing
- torch.cuda.synchronize()
- with nvtx.annotate("non-batched", color="yellow"):
- for q, k, r in packed_qkr:
- r.forward(positions, q, k)
- torch.cuda.synchronize()
- with nvtx.annotate("batched", color="green"):
- batched_rope.forward(positions, query, key, flatten_offsets)
- torch.cuda.synchronize()
+ return benchmark
if __name__ == "__main__":
@@ -116,17 +95,12 @@ def benchmark_rope_kernels_multi_lora(
parser.add_argument(
"--device", type=str, choices=["cuda:0", "cuda:1"], default="cuda:0"
)
+ parser.add_argument("--save-path", type=str, default="./configs/rope/")
args = parser.parse_args()
- print(args)
- benchmark_rope_kernels_multi_lora(
- is_neox_style=args.is_neox_style,
- batch_size=args.batch_size,
- seq_len=args.seq_len,
- num_heads=args.num_heads,
- head_size=args.head_size,
- rotary_dim=args.rotary_dim,
- dtype=getattr(torch, args.dtype),
- seed=args.seed,
- device=args.device,
+ # Get the benchmark function
+ benchmark = get_benchmark(
+ args.head_size, args.rotary_dim, args.is_neox_style, args.device
)
+ # Run performance benchmark
+ benchmark.run(print_data=True, save_path=args.save_path)
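The rewritten benchmark above leans on `triton.testing.do_bench`, which returns times in milliseconds at the requested quantiles; the inner `benchmark` function multiplies by 1000 so `perf_report` plots microseconds. A minimal sketch of that timing path, using a hypothetical stand-in op instead of the RoPE kernels (assumes torch and triton are installed and a CUDA device is available):

```python
import torch
import triton


def _toy_op(x: torch.Tensor) -> torch.Tensor:
    # Hypothetical stand-in for rope.forward_native / rope.forward_cuda above.
    return torch.nn.functional.silu(x)


x = torch.randn(4096, 4096, device="cuda")
quantiles = [0.5, 0.2, 0.8]  # median, p20, p80 of the measured runtimes
ms, p20_ms, p80_ms = triton.testing.do_bench(lambda: _toy_op(x), quantiles=quantiles)
print(f"median={ms * 1000:.1f}us  p20={p20_ms * 1000:.1f}us  p80={p80_ms * 1000:.1f}us")
```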
diff --git a/benchmarks/multi_turn/README.md b/benchmarks/multi_turn/README.md
index f5b5c6c97d48..b0be1e3a69a6 100644
--- a/benchmarks/multi_turn/README.md
+++ b/benchmarks/multi_turn/README.md
@@ -55,6 +55,10 @@ output_num_chunks 166.0 99.01 11.80 79.00 90.00 98.00 108.75
----------------------------------------------------------------------------------------------------
```
+If you run with `--warmup-step`, the summary will also include `warmup_runtime_sec`
+and `total_runtime_incl_warmup_sec` (while `runtime_sec` continues to reflect the
+benchmark-only runtime so the reported throughput stays comparable).
+
### JSON configuration file for synthetic conversations generation
The input flag `--input-file` is used to determine the input conversations for the benchmark.
diff --git a/benchmarks/multi_turn/bench_dataset.py b/benchmarks/multi_turn/bench_dataset.py
index 2674899d1cc5..8cb8a2f386a9 100644
--- a/benchmarks/multi_turn/bench_dataset.py
+++ b/benchmarks/multi_turn/bench_dataset.py
@@ -11,6 +11,7 @@
Color,
logger,
)
+from tqdm import tqdm
from transformers import AutoTokenizer # type: ignore
# Conversation ID is a string (e.g: "UzTK34D")
@@ -417,6 +418,10 @@ def generate_conversations(
data = file.read()
tokens_in_file = tokenizer.encode(data, add_special_tokens=False)
list_of_tokens.extend(tokens_in_file)
+ logger.info(
+ f"Loaded {len(tokens_in_file)} tokens from file {filename}, "
+ f"total tokens so far: {len(list_of_tokens)}"
+ )
conversations: ConversationsMap = {}
conv_id = 0
@@ -449,18 +454,25 @@ def generate_conversations(
)
base_offset += common_prefix_tokens
- for conv_id in range(args.num_conversations):
+ for conv_id in tqdm(
+ range(args.num_conversations),
+ total=args.num_conversations,
+ desc="Generating conversations",
+ unit="conv",
+ ):
# Generate a single conversation
messages: MessagesList = []
nturns = turn_count[conv_id]
# User prompt token count per turn (with lower limit)
- input_token_count: np.ndarray = args.input_num_tokens.sample(nturns)
+ input_token_count: np.ndarray = args.input_num_tokens.sample(nturns).astype(int)
input_token_count = np.maximum(input_token_count, base_prompt_token_count)
# Assistant answer token count per turn (with lower limit)
- output_token_count: np.ndarray = args.output_num_tokens.sample(nturns)
+ output_token_count: np.ndarray = args.output_num_tokens.sample(nturns).astype(
+ int
+ )
output_token_count = np.maximum(output_token_count, 1)
user_turn = True
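The `.astype(int)` casts above matter because the sampled per-turn token counts can come back as floats; they are then clamped to sane lower bounds. A small NumPy sketch of that step (the lognormal draws here are hypothetical stand-ins for `args.input_num_tokens` / `args.output_num_tokens`):

```python
import numpy as np

rng = np.random.default_rng(0)
nturns = 4
base_prompt_token_count = 16  # hypothetical lower bound for a user prompt

# Stand-in for args.input_num_tokens.sample(nturns): distributions may return floats,
# hence the .astype(int) before clamping to the minimum prompt size.
input_token_count = rng.lognormal(mean=5.0, sigma=0.5, size=nturns).astype(int)
input_token_count = np.maximum(input_token_count, base_prompt_token_count)

# Stand-in for args.output_num_tokens.sample(nturns), clamped to at least 1 token.
output_token_count = rng.lognormal(mean=4.0, sigma=0.5, size=nturns).astype(int)
output_token_count = np.maximum(output_token_count, 1)

print(input_token_count, output_token_count)
```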
diff --git a/benchmarks/multi_turn/benchmark_serving_multi_turn.py b/benchmarks/multi_turn/benchmark_serving_multi_turn.py
index 5d2ac66e5ab9..e23f6b923f1b 100644
--- a/benchmarks/multi_turn/benchmark_serving_multi_turn.py
+++ b/benchmarks/multi_turn/benchmark_serving_multi_turn.py
@@ -55,6 +55,7 @@ class ClientArgs(NamedTuple):
verify_output: bool
conversation_sampling: ConversationSampling
request_rate: float
+ max_retries: int
class RequestArgs(NamedTuple):
@@ -63,6 +64,7 @@ class RequestArgs(NamedTuple):
stream: bool
limit_min_tokens: int # Use negative value for no limit
limit_max_tokens: int # Use negative value for no limit
+ timeout_sec: int
class BenchmarkArgs(NamedTuple):
@@ -214,6 +216,7 @@ async def send_request(
stream: bool = True,
min_tokens: int | None = None,
max_tokens: int | None = None,
+ timeout_sec: int = 120,
) -> ServerResponse:
payload = {
"model": model,
@@ -235,10 +238,16 @@ async def send_request(
headers = {"Content-Type": "application/json"}
# Calculate the timeout for the request
- timeout_sec = 120
if max_tokens is not None:
# Assume TPOT of 200ms and use max_tokens to determine timeout
- timeout_sec = max(timeout_sec, int(max_tokens * 0.2))
+ token_based_timeout = int(max_tokens * 0.2)
+ if token_based_timeout > timeout_sec:
+ timeout_sec = token_based_timeout
+ logger.info(
+ "Using timeout of %ds based on max_tokens %d",
+ timeout_sec,
+ max_tokens,
+ )
timeout = aiohttp.ClientTimeout(total=timeout_sec)
valid_response = True
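The effective per-request timeout is the configured `--request-timeout-sec` unless `max_tokens` implies a longer decode at the assumed 200 ms per output token. A minimal sketch of that rule (the helper name is illustrative, not part of the script):

```python
def effective_timeout_sec(base_timeout_sec: int, max_tokens: int | None) -> int:
    # Assume ~200 ms per output token (TPOT) when max_tokens is known.
    if max_tokens is not None:
        return max(base_timeout_sec, int(max_tokens * 0.2))
    return base_timeout_sec


# The 120 s default is kept for short decodes but grows with max_tokens.
assert effective_timeout_sec(120, 500) == 120
assert effective_timeout_sec(120, 2000) == 400
```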
@@ -409,6 +418,7 @@ async def send_turn(
req_args.stream,
min_tokens,
max_tokens,
+ req_args.timeout_sec,
)
if response.valid is False:
@@ -518,6 +528,25 @@ async def poisson_sleep(request_rate: float, verbose: bool = False) -> None:
await asyncio.sleep(interval)
+async def exponential_backoff_sleep(
+ attempt_cnt: int,
+ base_rate: float = 1.0,
+ backoff_factor: float = 2.0,
+ jitter_fraction: float = 0.10,
+ verbose: bool = False,
+) -> None:
+ # Sleep with exponential backoff and jitter after a failed request.
+ backoff_delay = base_rate * (backoff_factor**attempt_cnt)
+ jittered_delay = backoff_delay * (
+ 1 + np.random.uniform(-jitter_fraction, jitter_fraction)
+ )
+
+ if verbose:
+ logger.info(f"Backoff for {jittered_delay:.3f} seconds...")
+
+ await asyncio.sleep(jittered_delay)
+
+
async def client_main(
args: ClientArgs,
req_args: RequestArgs,
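With the defaults above (`base_rate=1.0`, `backoff_factor=2.0`, ±10% jitter), the sleep between retries grows roughly as 1 s, 2 s, 4 s, 8 s. A synchronous sketch of the same delay schedule, for illustration only:

```python
import numpy as np


def backoff_delay(
    attempt_cnt: int,
    base_rate: float = 1.0,
    backoff_factor: float = 2.0,
    jitter_fraction: float = 0.10,
) -> float:
    # Exponential backoff with multiplicative jitter, mirroring the async helper above.
    delay = base_rate * (backoff_factor**attempt_cnt)
    return delay * (1 + np.random.uniform(-jitter_fraction, jitter_fraction))


np.random.seed(0)
print([round(backoff_delay(a), 2) for a in range(4)])  # ~[1, 2, 4, 8], each +/- 10%
```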
@@ -532,8 +561,11 @@ async def client_main(
f"{Color.CYAN}Started client {client_id}: max_num_requests={args.max_num_requests}, max_active_conversations={args.max_active_conversations}{Color.RESET}" # noqa: E501
)
- random.seed(args.seed)
- np.random.seed(args.seed)
+ # Set unique seed per client (each client runs in its own process)
+ # Add 1 to ensure no client uses the same seed as the main process
+ client_seed = args.seed + client_id + 1
+ random.seed(client_seed)
+ np.random.seed(client_seed)
# Active conversations
active_convs: ConversationsMap = {}
@@ -646,49 +678,62 @@ async def client_main(
)
time_of_last_turn[conv_id] = curr_time_sec
- success = True
- try:
- result = await send_turn(
- session,
- client_id,
- conv_id,
- messages,
- current_turn,
- tokenizer,
- req_args,
- args.print_content,
- args.verify_output,
- )
- if result is not None:
- result_queue.put(result)
- else:
- # None means that the request failed,
- # and should not be added to the statistics.
- success = False
- num_failures += 1
-
- logger.warning(
- f"{Color.YELLOW}Client {client_id} - Request rejected during conversation ID {conv_id} (turn: {current_turn}){Color.RESET}" # noqa: E501
+ success = False
+ for attempt_cnt in range(args.max_retries + 1):
+ try:
+ exception = False
+ result = await send_turn(
+ session,
+ client_id,
+ conv_id,
+ messages,
+ current_turn,
+ tokenizer,
+ req_args,
+ args.print_content,
+ args.verify_output,
+ )
+ if result is not None:
+ result_queue.put(result)
+ success = True
+ break
+ else:
+ logger.warning(
+ f"{Color.YELLOW}Client {client_id} - Request rejected during conversation ID {conv_id} (turn: {current_turn}){Color.RESET}" # noqa: E501
+ )
+ except asyncio.exceptions.TimeoutError:
+ exception = True
+ logger.error(
+ "%sClient %d - Timeout during conversation ID %s (turn: %d). "
+ "Base timeout is %ss (set with --request-timeout-sec), but the "
+ "effective timeout may be longer based on max_tokens. If this "
+ "is unexpected, consider increasing the timeout or checking "
+ "model performance.%s",
+ Color.RED,
+ client_id,
+ conv_id,
+ current_turn,
+ req_args.timeout_sec,
+ Color.RESET,
+ )
+ except Exception:
+ exception = True
+ logger.exception(
+ f"{Color.RED}Client {client_id} - Exception during conversation ID {conv_id} (turn: {current_turn}){Color.RESET}" # noqa: E501
)
- # Remove the conversation (should not be used again)
- active_convs.pop(conv_id)
+ # Sleep before retry if not last attempt
+ if not success and attempt_cnt < args.max_retries:
+ await exponential_backoff_sleep(attempt_cnt, verbose=args.verbose)
- except asyncio.exceptions.TimeoutError:
+ if not success:
num_failures += 1
- logger.exception(
- f"{Color.RED}Client {client_id} - Timeout during conversation ID {conv_id} (turn: {current_turn}){Color.RESET}" # noqa: E501
- )
- break # Exit gracefully instead of raising an error
+ # Remove the conversation (should not be used again)
+ active_convs.pop(conv_id)
+ if exception:
+ break # Exit gracefully instead of raising an error
- except Exception:
- num_failures += 1
- logger.exception(
- f"{Color.RED}Client {client_id} - Exception during conversation ID {conv_id} (turn: {current_turn}){Color.RESET}" # noqa: E501
- )
- break # Exit gracefully instead of raising an error
-
- if success:
+ else:
num_successes += 1
# Update the turns counter to include the LLM response
@@ -803,6 +848,7 @@ def get_client_config(
verify_output=args.verify_output,
conversation_sampling=args.conversation_sampling,
request_rate=args.request_rate,
+ max_retries=args.max_retries,
)
if args.limit_min_tokens > 0 or args.limit_max_tokens > 0:
@@ -815,6 +861,9 @@ def get_client_config(
"Invalid min/max tokens limits (min should not be larger than max)"
)
+ if args.request_timeout_sec <= 0:
+ raise ValueError("Request timeout must be a positive number")
+
# Arguments for API requests
chat_url = f"{args.url}/v1/chat/completions"
model_name = args.served_model_name if args.served_model_name else args.model
@@ -825,6 +874,7 @@ def get_client_config(
stream=not args.no_stream,
limit_min_tokens=args.limit_min_tokens,
limit_max_tokens=args.limit_max_tokens,
+ timeout_sec=args.request_timeout_sec,
)
return client_args, req_args
@@ -968,7 +1018,7 @@ async def main_mp(
f"(is alive: {client.is_alive()}){Color.RESET}"
)
- client.join(timeout=120)
+ client.join(timeout=req_args.timeout_sec + 1)
if client.is_alive():
logger.warning(
@@ -1026,6 +1076,7 @@ def process_statistics(
verbose: bool,
gen_conv_args: GenConvArgs | None = None,
excel_output: bool = False,
+ warmup_runtime_sec: float | None = None,
) -> None:
if len(client_metrics) == 0:
logger.info("No samples to process")
@@ -1119,8 +1170,13 @@ def process_statistics(
# Convert milliseconds to seconds
runtime_sec = runtime_sec / 1000.0
requests_per_sec = float(len(df)) / runtime_sec
-
- params = {"runtime_sec": runtime_sec, "requests_per_sec": requests_per_sec}
+ params = {
+ "runtime_sec": runtime_sec,
+ "requests_per_sec": requests_per_sec,
+ }
+ if warmup_runtime_sec is not None:
+ params["warmup_runtime_sec"] = warmup_runtime_sec
+ params["total_runtime_incl_warmup_sec"] = runtime_sec + warmup_runtime_sec
# Generate a summary of relevant metrics (and drop irrelevant data)
df = df.drop(columns=exclude).describe(percentiles=percentiles).transpose()
@@ -1334,6 +1390,16 @@ async def main() -> None:
help="Expected request rate (Poisson process) per client in requests/sec."
"Set to 0 for no delay between requests.",
)
+ parser.add_argument(
+ "--max-retries",
+ type=int,
+ default=int(os.environ.get("MULTITURN_BENCH_MAX_RETRIES", "0")),
+ help="Maximum number of retry attempts for timed-out requests. "
+ "Default is 0 (no retries). "
+ "Set to higher values to retry failed requests and maintain "
+ "fair workload distribution. "
+ "Can also be set via MULTITURN_BENCH_MAX_RETRIES environment variable.",
+ )
parser.add_argument(
"--conversation-sampling",
type=ConversationSampling,
@@ -1351,6 +1417,13 @@ async def main() -> None:
action="store_true",
help="Verify the LLM output (compare to the answers in the input JSON file)",
)
+ parser.add_argument(
+ "--request-timeout-sec",
+ type=int,
+ default=120,
+ help="Timeout in seconds for each API request (default: 120). "
+ "Automatically increased if max tokens imply longer decoding.",
+ )
parser.add_argument(
"--no-stream",
@@ -1426,6 +1499,7 @@ async def main() -> None:
f"Invalid --warmup-percentage={args.warmup_percentage}"
) from None
+ # Set global seeds for main process
random.seed(args.seed)
np.random.seed(args.seed)
@@ -1484,6 +1558,8 @@ async def main() -> None:
url=args.url, num_clients=args.num_clients, early_stop=not args.no_early_stop
)
+ warmup_runtime_sec: float | None = None
+
# Warm-up step
if args.warmup_step:
# Only send a single user prompt from every conversation.
@@ -1498,26 +1574,56 @@ async def main() -> None:
# all clients should finish their work before exiting
warmup_bench_args = bench_args._replace(early_stop=False)
- logger.info(f"{Color.PURPLE}Warmup start{Color.RESET}")
+ logger.info("%sWarmup start%s", Color.PURPLE, Color.RESET)
+ warmup_start_ns = time.perf_counter_ns()
conversations, _ = await main_mp(
warmup_client_args, req_args, warmup_bench_args, tokenizer, conversations
)
- logger.info(f"{Color.PURPLE}Warmup done{Color.RESET}")
+ warmup_runtime_sec = nanosec_to_sec(time.perf_counter_ns() - warmup_start_ns)
+ logger.info(
+ "%sWarmup runtime: %.3f sec (%.3f ms)%s",
+ Color.PURPLE,
+ warmup_runtime_sec,
+ warmup_runtime_sec * 1000,
+ Color.RESET,
+ )
+ logger.info("%sWarmup done%s", Color.PURPLE, Color.RESET)
# Run the benchmark
- start_time = time.perf_counter_ns()
+ benchmark_start_ns = time.perf_counter_ns()
client_convs, client_metrics = await main_mp(
client_args, req_args, bench_args, tokenizer, conversations
)
- total_runtime_ms = nanosec_to_millisec(time.perf_counter_ns() - start_time)
+ benchmark_runtime_sec = nanosec_to_sec(time.perf_counter_ns() - benchmark_start_ns)
# Calculate requests per second
- total_runtime_sec = total_runtime_ms / 1000.0
- rps = len(client_metrics) / total_runtime_sec
+ requests_per_sec = len(client_metrics) / benchmark_runtime_sec
+ benchmark_runtime_ms = benchmark_runtime_sec * 1000.0
logger.info(
- f"{Color.GREEN}All clients finished, total runtime: {total_runtime_sec:.3f} sec"
- f" ({total_runtime_ms:.3f} ms), requests per second: {rps:.3f}{Color.RESET}"
+ "%sAll clients finished, benchmark runtime: %.3f sec (%.3f ms), "
+ "requests per second: %.3f%s",
+ Color.GREEN,
+ benchmark_runtime_sec,
+ benchmark_runtime_ms,
+ requests_per_sec,
+ Color.RESET,
)
+ if warmup_runtime_sec is not None:
+ total_runtime_sec = benchmark_runtime_sec + warmup_runtime_sec
+ logger.info(
+ "%sWarmup runtime: %.3f sec (%.3f ms)%s",
+ Color.GREEN,
+ warmup_runtime_sec,
+ warmup_runtime_sec * 1000,
+ Color.RESET,
+ )
+ logger.info(
+ "%sTotal runtime (including warmup): %.3f sec (%.3f ms)%s",
+ Color.GREEN,
+ total_runtime_sec,
+ total_runtime_sec * 1000,
+ Color.RESET,
+ )
# Benchmark parameters
params = {
@@ -1542,6 +1648,7 @@ async def main() -> None:
verbose=args.verbose,
gen_conv_args=gen_conv_args,
excel_output=args.excel_output,
+ warmup_runtime_sec=warmup_runtime_sec,
)
if args.output_file is not None:
diff --git a/benchmarks/multi_turn/requirements.txt b/benchmarks/multi_turn/requirements.txt
index f0e1935914a1..bae656a5c5c4 100644
--- a/benchmarks/multi_turn/requirements.txt
+++ b/benchmarks/multi_turn/requirements.txt
@@ -2,4 +2,5 @@ numpy>=1.24
pandas>=2.0.0
aiohttp>=3.10
transformers>=4.46
-xlsxwriter>=3.2.1
\ No newline at end of file
+xlsxwriter>=3.2.1
+tqdm>=4.66
diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake
index dbda19fbcbf2..aa84125818d1 100644
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -15,6 +15,7 @@ endif()
#
set(ENABLE_AVX512BF16 $ENV{VLLM_CPU_AVX512BF16})
set(ENABLE_AVX512VNNI $ENV{VLLM_CPU_AVX512VNNI})
+set(ENABLE_AMXBF16 $ENV{VLLM_CPU_AMXBF16})
include_directories("${CMAKE_SOURCE_DIR}/csrc")
@@ -140,6 +141,22 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED)
set(ENABLE_AVX512VNNI OFF)
message(WARNING "Disable AVX512-VNNI ISA support, no avx512_vnni found in local CPU flags." " If cross-compilation is required, please set env VLLM_CPU_AVX512VNNI=1.")
endif()
+
+ find_isa(${CPUINFO} "amx_bf16" AMXBF16_FOUND)
+ if (AMXBF16_FOUND OR ENABLE_AMXBF16)
+ if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND
+ CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.3)
+ list(APPEND CXX_COMPILE_FLAGS "-mamx-bf16" "-mamx-tile")
+ set(ENABLE_AMXBF16 ON)
+ add_compile_definitions(CPU_CAPABILITY_AMXBF16)
+ else()
+ set(ENABLE_AMXBF16 OFF)
+ message(WARNING "Disable AMX_BF16 ISA support, requires gcc/g++ >= 12.3")
+ endif()
+ else()
+ set(ENABLE_AMXBF16 OFF)
+ message(WARNING "Disable AMX_BF16 ISA support, no amx_bf16 found in local CPU flags." " If cross-compilation is required, please set env VLLM_CPU_AMXBF16=1.")
+ endif()
elseif (AVX2_FOUND)
list(APPEND CXX_COMPILE_FLAGS "-mavx2")
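`find_isa` works by grepping the CPU flags reported by the system (e.g. the `flags` line of `/proc/cpuinfo` on x86 Linux), and `VLLM_CPU_AMXBF16=1` force-enables the path when cross-compiling. A rough Python equivalent of that detection, purely illustrative and Linux-only:

```python
def cpu_flag_present(flag: str, cpuinfo_path: str = "/proc/cpuinfo") -> bool:
    # Look for the flag token on the CPU "flags" line, similar to CMake's find_isa().
    try:
        with open(cpuinfo_path) as f:
            for line in f:
                if line.startswith("flags"):
                    return flag in line.split()
    except OSError:
        pass
    return False


print("amx_bf16 supported:", cpu_flag_present("amx_bf16"))
```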
@@ -193,7 +210,30 @@ endif()
if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
# Fetch and build Arm Compute Library (ACL) as oneDNN's backend for AArch64
# TODO [fadara01]: remove this once ACL can be fetched and built automatically as a dependency of oneDNN
+ set(ONEDNN_AARCH64_USE_ACL OFF CACHE BOOL "")
if(ASIMD_FOUND)
+ # Set number of parallel build processes
+ include(ProcessorCount)
+ ProcessorCount(NPROC)
+ if(NOT NPROC)
+ set(NPROC 4)
+ endif()
+ # locate PyTorch's libgomp (e.g. site-packages/torch.libs/libgomp-947d5fa1.so.1.0.0)
+ # and create a local shim dir with it
+ vllm_prepare_torch_gomp_shim(VLLM_TORCH_GOMP_SHIM_DIR)
+
+ find_library(OPEN_MP
+ NAMES gomp
+ PATHS ${VLLM_TORCH_GOMP_SHIM_DIR}
+ NO_DEFAULT_PATH
+ REQUIRED
+ )
+ # Set LD_LIBRARY_PATH to include the shim dir at build time to use the same libgomp as PyTorch
+ if (OPEN_MP)
+ set(ENV{LD_LIBRARY_PATH} "${VLLM_TORCH_GOMP_SHIM_DIR}:$ENV{LD_LIBRARY_PATH}")
+ endif()
+
+ # Fetch and populate ACL
if(DEFINED ENV{ACL_ROOT_DIR} AND IS_DIRECTORY "$ENV{ACL_ROOT_DIR}")
message(STATUS "Using ACL from specified source directory: $ENV{ACL_ROOT_DIR}")
else()
@@ -202,43 +242,58 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
SUBBUILD_DIR "${FETCHCONTENT_BASE_DIR}/arm_compute-subbuild"
SOURCE_DIR "${FETCHCONTENT_BASE_DIR}/arm_compute-src"
GIT_REPOSITORY https://github.com/ARM-software/ComputeLibrary.git
- GIT_TAG v52.2.0
+ GIT_TAG v52.6.0
GIT_SHALLOW TRUE
GIT_PROGRESS TRUE
)
set(ENV{ACL_ROOT_DIR} "${arm_compute_SOURCE_DIR}")
+ set(ACL_LIB_DIR "$ENV{ACL_ROOT_DIR}/build")
endif()
- # Build ACL with scons
- include(ProcessorCount)
- ProcessorCount(_NPROC)
- set(_scons_cmd
- scons -j${_NPROC}
- Werror=0 debug=0 neon=1 examples=0 embed_kernels=0 os=linux
- arch=armv8.2-a build=native benchmark_examples=0 fixed_format_kernels=1
- multi_isa=1 openmp=1 cppthreads=0
+ # Build ACL with CMake
+ set(ARM_COMPUTE_BUILD_SHARED_LIB "OFF")
+ set(CMAKE_BUILD_TYPE "Release")
+ set(ARM_COMPUTE_ARCH "armv8.2-a")
+ set(ARM_COMPUTE_ENABLE_ASSERTS "OFF")
+ set(ARM_COMPUTE_ENABLE_CPPTHREADS "OFF")
+ set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER")
+ set(ARM_COMPUTE_ENABLE_OPENMP "ON")
+ set(ARM_COMPUTE_ENABLE_WERROR "OFF")
+ set(ARM_COMPUTE_BUILD_EXAMPLES "OFF")
+ set(ARM_COMPUTE_BUILD_TESTING "OFF")
+
+ set(_cmake_config_cmd
+ ${CMAKE_COMMAND} -G Ninja -B build
+ -DARM_COMPUTE_BUILD_SHARED_LIB=OFF
+ -DCMAKE_BUILD_TYPE=Release
+ -DARM_COMPUTE_ARCH=armv8.2-a
+ -DARM_COMPUTE_ENABLE_ASSERTS=OFF
+ -DARM_COMPUTE_ENABLE_CPPTHREADS=OFF
+ -DARM_COMPUTE_ENABLE_OPENMP=ON
+ -DARM_COMPUTE_ENABLE_WERROR=OFF
+ -DARM_COMPUTE_BUILD_EXAMPLES=OFF
+ -DARM_COMPUTE_BUILD_TESTING=OFF)
+ set(_cmake_build_cmd
+ ${CMAKE_COMMAND} --build build -- -j${NPROC}
)
- # locate PyTorch's libgomp (e.g. site-packages/torch.libs/libgomp-947d5fa1.so.1.0.0)
- # and create a local shim dir with it
- include("${CMAKE_CURRENT_LIST_DIR}/utils.cmake")
- vllm_prepare_torch_gomp_shim(VLLM_TORCH_GOMP_SHIM_DIR)
-
- if(NOT VLLM_TORCH_GOMP_SHIM_DIR STREQUAL "")
- list(APPEND _scons_cmd extra_link_flags=-L${VLLM_TORCH_GOMP_SHIM_DIR})
- endif()
-
execute_process(
- COMMAND ${_scons_cmd}
+ COMMAND ${_cmake_config_cmd}
+ WORKING_DIRECTORY "$ENV{ACL_ROOT_DIR}"
+ )
+ execute_process(
+ COMMAND ${_cmake_build_cmd}
WORKING_DIRECTORY "$ENV{ACL_ROOT_DIR}"
RESULT_VARIABLE _acl_rc
)
+
if(NOT _acl_rc EQUAL 0)
-  message(FATAL_ERROR "ACL SCons build failed (exit ${_acl_rc}).")
+  message(FATAL_ERROR "ACL CMake build failed (exit ${_acl_rc}).")
endif()
+ message(STATUS "Arm Compute Library (ACL) built successfully.")
- set(ONEDNN_AARCH64_USE_ACL "ON")
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/")
+ # VLLM/oneDNN settings for ACL
+ set(ONEDNN_AARCH64_USE_ACL ON CACHE BOOL "" FORCE)
add_compile_definitions(VLLM_USE_ACL)
endif()
@@ -255,7 +310,7 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
FetchContent_Declare(
oneDNN
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
- GIT_TAG v3.9
+ GIT_TAG v3.10
GIT_PROGRESS TRUE
GIT_SHALLOW TRUE
)
@@ -275,7 +330,10 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
set(ONEDNN_VERBOSE "OFF")
set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
+ set(VLLM_BUILD_TYPE ${CMAKE_BUILD_TYPE})
+ set(CMAKE_BUILD_TYPE "Release") # remove oneDNN debug symbols to reduce size
FetchContent_MakeAvailable(oneDNN)
+ set(CMAKE_BUILD_TYPE ${VLLM_BUILD_TYPE})
add_library(dnnl_ext OBJECT "csrc/cpu/dnnl_helper.cpp")
target_include_directories(
dnnl_ext
@@ -305,14 +363,14 @@ endif()
#
set(VLLM_EXT_SRC
"csrc/cpu/activation.cpp"
- "csrc/cpu/attention.cpp"
- "csrc/cpu/cache.cpp"
"csrc/cpu/utils.cpp"
"csrc/cpu/layernorm.cpp"
"csrc/cpu/mla_decode.cpp"
"csrc/cpu/pos_encoding.cpp"
- "csrc/cpu/torch_bindings.cpp"
- "csrc/moe/dynamic_4bit_int_moe_cpu.cpp")
+ "csrc/moe/dynamic_4bit_int_moe_cpu.cpp"
+ "csrc/cpu/cpu_attn.cpp"
+ "csrc/cpu/scratchpad_manager.cpp"
+ "csrc/cpu/torch_bindings.cpp")
if (AVX512_FOUND AND NOT AVX512_DISABLED)
set(VLLM_EXT_SRC
diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake
index 931090db50e9..567c8959f045 100644
--- a/cmake/external_projects/vllm_flash_attn.cmake
+++ b/cmake/external_projects/vllm_flash_attn.cmake
@@ -38,7 +38,7 @@ else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
- GIT_TAG a893712401d70362fbb299cd9c4b3476e8e9ed54
+ GIT_TAG 58e0626a692f09241182582659e3bf8f16472659
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
diff --git a/csrc/cpu/attention.cpp b/csrc/cpu/attention.cpp
deleted file mode 100644
index 82862fea7f2b..000000000000
--- a/csrc/cpu/attention.cpp
+++ /dev/null
@@ -1,798 +0,0 @@
-#include "cpu_types.hpp"
-
-namespace {
-
-template <typename scalar_t>
-struct KernelVecType {
- using q_load_vec_type = void;
- using q_vec_type = void;
- using k_load_vec_type = void;
- using k_vec_type = void;
- using qk_acc_vec_type = void;
- using v_load_vec_type = void;
-};
-
-template <>
-struct KernelVecType<float> {
- using q_load_vec_type = vec_op::FP32Vec4;
- using q_vec_type = vec_op::FP32Vec16;
- using k_load_vec_type = vec_op::FP32Vec16;
- using k_vec_type = vec_op::FP32Vec16;
- using qk_acc_vec_type = vec_op::FP32Vec16;
- using v_load_vec_type = vec_op::FP32Vec16;
-};
-
-template <>
-struct KernelVecType<c10::Half> {
-#if defined(__powerpc64__) || defined(__s390x__)
- // Power and s390x architecture-specific vector types
- using q_load_vec_type = vec_op::FP32Vec8;
- using k_load_vec_type = vec_op::FP32Vec16;
- using v_load_vec_type = vec_op::FP32Vec16;
-#else
- // Fallback for other architectures, including x86
- using q_load_vec_type = vec_op::FP16Vec8;
- using k_load_vec_type = vec_op::FP16Vec16;
- using v_load_vec_type = vec_op::FP16Vec16;
-#endif
- using q_vec_type = vec_op::FP32Vec16;
- using k_vec_type = vec_op::FP32Vec16;
- using qk_acc_vec_type = vec_op::FP32Vec16;
-};
-
-#ifdef __AVX512BF16__
-template <>
-struct KernelVecType<c10::BFloat16> {
- using q_load_vec_type = vec_op::BF16Vec8;
- using q_vec_type = vec_op::BF16Vec32;
- using k_load_vec_type = vec_op::BF16Vec32;
- using k_vec_type = vec_op::BF16Vec32;
- using qk_acc_vec_type = vec_op::FP32Vec16;
- using v_load_vec_type = vec_op::BF16Vec16;
-};
-#else
- #ifdef __aarch64__
- #ifndef ARM_BF16_SUPPORT
- // pass
- #else
-template <>
-struct KernelVecType<c10::BFloat16> {
- using q_load_vec_type = vec_op::BF16Vec8;
- using q_vec_type = vec_op::FP32Vec16;
- using k_load_vec_type = vec_op::BF16Vec16;
- using k_vec_type = vec_op::FP32Vec16;
- using qk_acc_vec_type = vec_op::FP32Vec16;
- using v_load_vec_type = vec_op::BF16Vec16;
-};
- #endif
- #else
-template <>
-struct KernelVecType<c10::BFloat16> {
- using q_load_vec_type = vec_op::BF16Vec8;
- using q_vec_type = vec_op::FP32Vec16;
- using k_load_vec_type = vec_op::BF16Vec16;
- using k_vec_type = vec_op::FP32Vec16;
- using qk_acc_vec_type = vec_op::FP32Vec16;
- using v_load_vec_type = vec_op::BF16Vec16;
-};
- #endif
-#endif
-
-template <typename T>
-FORCE_INLINE std::pair<T, T> reduceSoftmax(T* data, const int size,
- const int capacity) {
- T max = data[0];
- for (int i = 1; i < size; ++i) {
- max = max >= data[i] ? max : data[i];
- }
-
- T sum = 0;
- for (int i = 0; i < size; ++i) {
- data[i] = std::exp(data[i] - max);
- sum += data[i];
- }
-
- int i = 0;
- for (; i < size; ++i) {
- data[i] /= sum;
- }
-
- for (; i < capacity; ++i) {
- data[i] = 0;
- }
-
- return {max, sum};
-}
-
-template <typename T>
-FORCE_INLINE std::pair<T, T> reduceSoftmaxAlibi(T* data, const int size,
- const int capacity,
- const float alibi_slope,
- const int start_index,
- const int seq_len) {
- data[0] += alibi_slope * (start_index - seq_len + 1);
- T max = data[0];
- for (int i = 1; i < size; ++i) {
- T qk = data[i] + alibi_slope * (start_index + i - seq_len + 1);
- data[i] = qk;
- max = max >= qk ? max : qk;
- }
-
- T sum = 0;
- for (int i = 0; i < size; ++i) {
- data[i] = std::exp(data[i] - max);
- sum += data[i];
- }
-
- int i = 0;
- for (; i < size; ++i) {
- data[i] /= sum;
- }
-
- for (; i < capacity; ++i) {
- data[i] = 0;
- }
-
- return {max, sum};
-}
-
-template <typename T>
-FORCE_INLINE void reducePartitionSoftmax(const T* max_data, T* sum_data,
- const int size) {
- T max = max_data[0];
- for (int i = 1; i < size; ++i) {
- max = max >= max_data[i] ? max : max_data[i];
- }
-
- T rescaled_sum = 0;
- for (int i = 0; i < size; ++i) {
- T rescale_factor = std::exp(max_data[i] - max);
- rescaled_sum += rescale_factor * sum_data[i];
- sum_data[i] *= rescale_factor;
- }
- for (int i = 0; i < size; ++i) {
- sum_data[i] /= rescaled_sum + 1e-8;
- }
-}
-
-template <typename scalar_t, int HEAD_SIZE, int BLOCK_SIZE, int x>
-struct reduceQKBlockKernel {
- using q_load_vec_type = typename KernelVecType<scalar_t>::q_load_vec_type;
- using q_vec_type = typename KernelVecType<scalar_t>::q_vec_type;
- using k_load_vec_type = typename KernelVecType<scalar_t>::k_load_vec_type;
- using k_vec_type = typename KernelVecType<scalar_t>::k_vec_type;
- using qk_acc_vec_type = typename KernelVecType<scalar_t>::qk_acc_vec_type;
-
- constexpr static int TOKEN_PER_GROUP = k_load_vec_type::get_elem_num() / x;
- constexpr static int MAX_GROUP_NUM = 16 / TOKEN_PER_GROUP;
- constexpr static int UNROLL_GROUP_NUM = MAX_GROUP_NUM / 4;
-
- static_assert(MAX_GROUP_NUM == 8 || MAX_GROUP_NUM == 4);
- static_assert(k_load_vec_type::get_elem_num() % x == 0);
- static_assert(q_load_vec_type::get_elem_num() * sizeof(scalar_t) == 16);
-
- FORCE_INLINE static void call(const scalar_t* __restrict__ q,
- const scalar_t* __restrict__ k_block,
- float* __restrict__ logits, float scale,
- const int token_num) {
- const int group_num = (token_num + TOKEN_PER_GROUP - 1) / TOKEN_PER_GROUP;
-
- qk_acc_vec_type group_accums[MAX_GROUP_NUM];
- if (token_num == BLOCK_SIZE) {
- for (int q_offset = 0; q_offset < HEAD_SIZE;
- q_offset += x, k_block += x * BLOCK_SIZE) {
- q_load_vec_type q_load_group_vec(q + q_offset);
- q_vec_type q_group_vec(q_load_group_vec);
-
- vec_op::unroll_loop<int, MAX_GROUP_NUM>(
- [k_block, &q_group_vec, &group_accums](int token_group_idx) {
- k_load_vec_type k_load_group_vec(k_block + token_group_idx * x *
- TOKEN_PER_GROUP);
- k_vec_type k_group_vec(k_load_group_vec);
- vec_op::fma(group_accums[token_group_idx], q_group_vec,
- k_group_vec);
- vec_op::prefetch(k_block + x * BLOCK_SIZE +
- token_group_idx * x * TOKEN_PER_GROUP);
- });
- }
- } else {
- for (int q_offset = 0; q_offset < HEAD_SIZE;
- q_offset += x, k_block += x * BLOCK_SIZE) {
- q_load_vec_type q_load_group_vec(q + q_offset);
- q_vec_type q_group_vec(q_load_group_vec);
- for (int token_group_start = 0; token_group_start < group_num;
- token_group_start += UNROLL_GROUP_NUM) {
- vec_op::unroll_loop<int, UNROLL_GROUP_NUM>(
- [token_group_start, k_block, &q_group_vec,
- &group_accums](int token_group_idx) {
- token_group_idx += token_group_start;
- k_load_vec_type k_load_group_vec(k_block + token_group_idx * x *
- TOKEN_PER_GROUP);
- k_vec_type k_group_vec(k_load_group_vec);
- vec_op::fma(group_accums[token_group_idx], q_group_vec,
- k_group_vec);
- vec_op::prefetch(k_block + x * BLOCK_SIZE +
- token_group_idx * x * TOKEN_PER_GROUP);
- });
- }
- }
- }
-
- for (int token_group_idx = 0; token_group_idx < group_num;
- ++token_group_idx) {
- vec_op::unroll_loop<int, TOKEN_PER_GROUP>(
- [&group_accums, logits, scale, token_group_idx](int token_idx) {
- float dot_v =
- group_accums[token_group_idx]
- .template reduce_sub_sum(token_idx);
- logits[token_group_idx * TOKEN_PER_GROUP + token_idx] =
- dot_v * scale;
- });
- }
- }
-};
-
-template
-FORCE_INLINE void reduceValueBlock(const float* prob, const scalar_t* v_block,
- acc_t&& acc) {
- using v_load_vec_type = typename KernelVecType<scalar_t>::v_load_vec_type;
- constexpr int ELEM_NUM = v_load_vec_type::get_elem_num();
- static_assert(BLOCK_SIZE == ELEM_NUM);
- vec_op::FP32Vec16 prob_vec(prob);
-
- vec_op::unroll_loop([&](int head_elem_idx) {
- v_load_vec_type v_vec(v_block + BLOCK_SIZE * head_elem_idx);
- vec_op::FP32Vec16 fp32_v_vec(v_vec);
- acc[head_elem_idx] = acc[head_elem_idx] + prob_vec * fp32_v_vec;
- });
-}
-}; // namespace
-
-// Paged attention v1
-namespace {
-template <typename scalar_t, int HEAD_SIZE, int BLOCK_SIZE>
-struct paged_attention_v1_impl {
- static void call(
- scalar_t* __restrict__ out, // [num_seqs, num_heads, head_size]
- const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size]
- const scalar_t* __restrict__ k_cache, // [num_blocks, num_kv_heads,
- // head_size/x, block_size, x]
- const scalar_t* __restrict__ v_cache, // [num_blocks, num_kv_heads,
- // head_size, block_size]
- const int num_kv_heads, const float scale,
- const int* __restrict__ block_tables, // [num_seqs,
- // max_num_blocks_per_seq]
- const int* __restrict__ seq_lens, // [num_seqs]
- const int max_num_blocks_per_seq,
- const float* __restrict__ alibi_slopes, // [num_heads]
- const int q_stride, const int kv_block_stride, const int kv_head_stride,
- const int num_seqs, const int num_heads) {
- constexpr int x = 16 / sizeof(scalar_t);
- const int num_queries_per_kv = num_heads / num_kv_heads;
-
- static_assert(BLOCK_SIZE == 16);
-
- int max_seq_len = max_num_blocks_per_seq * BLOCK_SIZE;
- int max_seq_len_padded = (max_seq_len + 15) & 0xFFFFFFF0;
- TORCH_CHECK((max_seq_len_padded * sizeof(float)) % 64 == 0);
-
- const int parallel_work_item_num = omp_get_max_threads();
-
- size_t logits_bytes =
- parallel_work_item_num * max_seq_len_padded * sizeof(float);
- float* logits = (float*)std::aligned_alloc(
- 64, logits_bytes); // Cacheline alignment for each context token.
- // [parallel_work_item_num, max_seq_len_padded]
-
-#pragma omp parallel for collapse(2) schedule(dynamic, 1)
- for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) {
- for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
- int seq_len = seq_lens[seq_idx];
- const int* seq_block_table =
- block_tables + max_num_blocks_per_seq * seq_idx;
- const int block_num = (seq_len + BLOCK_SIZE - 1) / BLOCK_SIZE;
- const int64_t kv_head_idx = head_idx / num_queries_per_kv;
- const scalar_t* __restrict__ q_vec_ptr =
- q + seq_idx * q_stride + head_idx * HEAD_SIZE;
- const int last_block_token_num = seq_len - (block_num - 1) * BLOCK_SIZE;
- float* __restrict__ thread_block_logits =
- logits + omp_get_thread_num() * max_seq_len_padded;
-
- // Compute logits
- for (int block_idx = 0; block_idx < block_num; ++block_idx) {
- const int64_t physical_block_idx = seq_block_table[block_idx];
- const scalar_t* __restrict__ k_block_cache_ptr =
- k_cache + physical_block_idx * kv_block_stride +
- kv_head_idx * kv_head_stride;
- float* __restrict__ head_block_logits =
- thread_block_logits + block_idx * BLOCK_SIZE;
-
- reduceQKBlockKernel<scalar_t, HEAD_SIZE, BLOCK_SIZE, x>::call(
- q_vec_ptr, k_block_cache_ptr, head_block_logits, scale,
- block_idx == block_num - 1 ? last_block_token_num : BLOCK_SIZE);
- }
-
- // Compute softmax
- if (alibi_slopes) {
- reduceSoftmaxAlibi(thread_block_logits, seq_len,
- block_num * BLOCK_SIZE, alibi_slopes[head_idx], 0,
- seq_len);
- } else {
- reduceSoftmax(thread_block_logits, seq_len, block_num * BLOCK_SIZE);
- }
-
- // Compute value
- constexpr int head_elem_num_per_partition = 16;
- constexpr int head_partition_num =
- HEAD_SIZE / head_elem_num_per_partition;
- for (int head_part_idx = 0; head_part_idx < head_partition_num;
- ++head_part_idx) {
- vec_op::FP32Vec16 accums[head_elem_num_per_partition];
- scalar_t* __restrict__ out_ptr =
- out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE +
- head_part_idx * head_elem_num_per_partition;
- for (int block_idx = 0; block_idx < block_num; ++block_idx) {
- const int64_t physical_block_idx = seq_block_table[block_idx];
- const float* __restrict__ prob_vec_ptr =
- thread_block_logits + block_idx * BLOCK_SIZE;
- const scalar_t* __restrict__ v_block_cache_ptr =
- v_cache + physical_block_idx * kv_block_stride +
- kv_head_idx * kv_head_stride +
- BLOCK_SIZE * head_part_idx * head_elem_num_per_partition;
- reduceValueBlock(
- prob_vec_ptr, v_block_cache_ptr, accums);
-
- if (block_idx != block_num - 1) {
- const int64_t next_physical_block_idx =
- seq_block_table[block_idx + 1];
- const scalar_t* __restrict__ next_v_block_cache_ptr =
- v_cache + next_physical_block_idx * kv_block_stride +
- kv_head_idx * kv_head_stride +
- BLOCK_SIZE * head_part_idx * head_elem_num_per_partition;
- vec_op::unroll_loop(
- [&](int head_elem_idx) {
- if (head_elem_idx % 2 == 0) {
- vec_op::prefetch(next_v_block_cache_ptr +
- BLOCK_SIZE * head_elem_idx);
- }
- });
- }
- }
-
- vec_op::unroll_loop(
- [&](int head_elem_idx) {
- float value = accums[head_elem_idx].reduce_sum();
- vec_op::storeFP32(value, out_ptr + head_elem_idx);
- });
- }
- }
- }
- std::free(logits);
- }
-};
-
-#define LAUNCH_V1_ATTENTION_KERNEL(T, HEAD_SIZE, BLOCK_SIZE) \
- paged_attention_v1_impl<T, HEAD_SIZE, BLOCK_SIZE>::call( \
- out_ptr, query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale, \
- block_tables_ptr, seq_lens_ptr, max_num_blocks_per_seq, \
- alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride, num_seqs, \
- num_heads);
-
-template <typename T, int BLOCK_SIZE>
-void paged_attention_v1_impl_launcher(
- torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
- torch::Tensor& value_cache, int num_kv_heads, float scale,
- torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len,
- const std::optional<torch::Tensor>& alibi_slopes) {
- int num_seqs = query.size(0);
- int num_heads = query.size(1);
- int head_size = query.size(2);
- int max_num_blocks_per_seq = block_tables.size(1);
- int q_stride = query.stride(0);
- int kv_block_stride = key_cache.stride(0);
- int kv_head_stride = key_cache.stride(1);
-
- // NOTE: alibi_slopes is optional.
- const float* alibi_slopes_ptr =
- alibi_slopes
- ? reinterpret_cast<const float*>(alibi_slopes.value().data_ptr())
- : nullptr;
-
- T* out_ptr = reinterpret_cast<T*>(out.data_ptr());
- T* query_ptr = reinterpret_cast<T*>(query.data_ptr());
- T* key_cache_ptr = reinterpret_cast<T*>(key_cache.data_ptr());
- T* value_cache_ptr = reinterpret_cast<T*>(value_cache.data_ptr());
- int* block_tables_ptr = block_tables.data_ptr<int>();
- int* seq_lens_ptr = seq_lens.data_ptr<int>();
-
- switch (head_size) {
- case 32:
- LAUNCH_V1_ATTENTION_KERNEL(T, 32, BLOCK_SIZE);
- break;
- case 64:
- LAUNCH_V1_ATTENTION_KERNEL(T, 64, BLOCK_SIZE);
- break;
- case 80:
- LAUNCH_V1_ATTENTION_KERNEL(T, 80, BLOCK_SIZE);
- break;
- case 96:
- LAUNCH_V1_ATTENTION_KERNEL(T, 96, BLOCK_SIZE);
- break;
- case 112:
- LAUNCH_V1_ATTENTION_KERNEL(T, 112, BLOCK_SIZE);
- break;
- case 128:
- LAUNCH_V1_ATTENTION_KERNEL(T, 128, BLOCK_SIZE);
- break;
- case 192:
- LAUNCH_V1_ATTENTION_KERNEL(T, 192, BLOCK_SIZE);
- break;
- case 256:
- LAUNCH_V1_ATTENTION_KERNEL(T, 256, BLOCK_SIZE);
- break;
- default:
- TORCH_CHECK(false, "Unsupported head size: ", head_size);
- break;
- }
-}
-
-#define CALL_V1_KERNEL_LAUNCHER(T, BLOCK_SIZE) \
- paged_attention_v1_impl_launcher<T, BLOCK_SIZE>( \
- out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, \
- seq_lens, max_seq_len, alibi_slopes);
-
-#define CALL_V1_KERNEL_LAUNCHER_BLOCK_SIZE(T) \
- switch (block_size) { \
- case 16: \
- CALL_V1_KERNEL_LAUNCHER(T, 16); \
- break; \
- default: \
- TORCH_CHECK(false, "Unsupported block size: ", block_size); \
- break; \
- }
-} // namespace
-
-void paged_attention_v1(
- torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
- torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
- torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
- int64_t max_seq_len, const std::optional<torch::Tensor>& alibi_slopes,
- const std::string& kv_cache_dtype, torch::Tensor& k_scale,
- torch::Tensor& v_scale, const int64_t tp_rank,
- const int64_t blocksparse_local_blocks,
- const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
- const int64_t blocksparse_head_sliding_step) {
- TORCH_CHECK(blocksparse_vert_stride <= 1,
- "CPU backend does not support blocksparse attention yet.");
- VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "paged_attention_v1_impl",
- [&] {
- CPU_KERNEL_GUARD_IN(paged_attention_v1_impl)
- CALL_V1_KERNEL_LAUNCHER_BLOCK_SIZE(scalar_t);
- CPU_KERNEL_GUARD_OUT(paged_attention_v1_impl)
- });
-}
-
-// Paged attention v2
-namespace {
-template <typename scalar_t, int HEAD_SIZE, int BLOCK_SIZE, int PARTITION_SIZE>
-struct paged_attention_v2_impl {
- static void call(
- scalar_t* __restrict__ out, // [num_seqs, num_heads, head_size]
- float* __restrict__ exp_sums, // [num_seqs, num_heads,
- // max_num_partitions]
- float* __restrict__ max_logits, // [num_seqs, num_heads,
- // max_num_partitions]
- scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads,
- // max_num_partitions, head_size]
- const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size]
- const scalar_t* __restrict__ k_cache, // [num_blocks, num_kv_heads,
- // head_size/x, block_size, x]
- const scalar_t* __restrict__ v_cache, // [num_blocks, num_kv_heads,
- // head_size, block_size]
- const int num_kv_heads, const float scale,
- const int* __restrict__ block_tables, // [num_seqs,
- // max_num_blocks_per_seq]
- const int* __restrict__ seq_lens, // [num_seqs]
- const int max_num_blocks_per_seq,
- const float* __restrict__ alibi_slopes, // [num_heads]
- const int q_stride, const int kv_block_stride, const int kv_head_stride,
- const int num_seqs, const int num_heads, const int max_num_partitions) {
- constexpr int x = 16 / sizeof(scalar_t);
- const int num_queries_per_kv = num_heads / num_kv_heads;
-
- static_assert(BLOCK_SIZE == 16);
- static_assert(PARTITION_SIZE * sizeof(float) % 64 == 0);
- static_assert(PARTITION_SIZE % BLOCK_SIZE == 0);
-
-#pragma omp parallel for collapse(3) schedule(static, 1)
- for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) {
- for (int partition_idx = 0; partition_idx < max_num_partitions;
- ++partition_idx) {
- for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
- const int seq_len = seq_lens[seq_idx];
- const int start_token_idx = partition_idx * PARTITION_SIZE;
-
- if (start_token_idx >= seq_len) continue;
-
- const int partition_num =
- (seq_len + PARTITION_SIZE - 1) / PARTITION_SIZE;
- const bool no_reduce = (partition_num == 1);
- const int token_num =
- (std::min(seq_len, start_token_idx + PARTITION_SIZE) -
- start_token_idx);
- const int block_num = (token_num + BLOCK_SIZE - 1) / BLOCK_SIZE;
- const int last_block_token_num =
- token_num - (block_num - 1) * BLOCK_SIZE;
- const int* seq_block_table = block_tables +
- max_num_blocks_per_seq * seq_idx +
- start_token_idx / BLOCK_SIZE;
- const int64_t kv_head_idx = head_idx / num_queries_per_kv;
- const scalar_t* __restrict__ q_vec_ptr =
- q + seq_idx * q_stride + head_idx * HEAD_SIZE;
-
- float logits[PARTITION_SIZE] __attribute__((aligned(64))) = {0};
-
- // Compute logits
- for (int block_idx = 0; block_idx < block_num; ++block_idx) {
- const int64_t physical_block_idx = seq_block_table[block_idx];
- const scalar_t* __restrict__ k_block_cache_ptr =
- k_cache + physical_block_idx * kv_block_stride +
- kv_head_idx * kv_head_stride;
- float* __restrict__ head_block_logits =
- logits + block_idx * BLOCK_SIZE;
-
- reduceQKBlockKernel<scalar_t, HEAD_SIZE, BLOCK_SIZE, x>::call(
- q_vec_ptr, k_block_cache_ptr, head_block_logits, scale,
- block_idx == block_num - 1 ? last_block_token_num : BLOCK_SIZE);
- }
-
- std::pair<float, float> max_and_sum;
- if (alibi_slopes) {
- max_and_sum = reduceSoftmaxAlibi(
- logits, token_num, block_num * BLOCK_SIZE,
- alibi_slopes[head_idx], start_token_idx, seq_len);
- } else {
- max_and_sum =
- reduceSoftmax(logits, token_num, block_num * BLOCK_SIZE);
- }
-
- auto&& [max_logit, exp_sum] = max_and_sum;
-
- scalar_t* __restrict__ output_buffer = nullptr;
- if (!no_reduce) {
- auto idx = seq_idx * num_heads * max_num_partitions +
- head_idx * max_num_partitions + partition_idx;
- max_logits[idx] = max_logit;
- exp_sums[idx] = exp_sum;
- output_buffer =
- tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE +
- head_idx * max_num_partitions * HEAD_SIZE +
- partition_idx * HEAD_SIZE;
- } else {
- output_buffer =
- out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE;
- }
-
- // Compute value
- constexpr int head_elem_num_per_partition = 16;
- constexpr int head_partition_num =
- HEAD_SIZE / head_elem_num_per_partition;
- for (int head_part_idx = 0; head_part_idx < head_partition_num;
- ++head_part_idx) {
- vec_op::FP32Vec16 accums[head_elem_num_per_partition];
- scalar_t* __restrict__ out_ptr =
- output_buffer + head_part_idx * head_elem_num_per_partition;
- for (int block_idx = 0; block_idx < block_num; ++block_idx) {
- const int64_t physical_block_idx = seq_block_table[block_idx];
- const float* __restrict__ prob_vec_ptr =
- logits + block_idx * BLOCK_SIZE;
- const scalar_t* __restrict__ v_block_cache_ptr =
- v_cache + physical_block_idx * kv_block_stride +
- kv_head_idx * kv_head_stride +
- BLOCK_SIZE * head_part_idx * head_elem_num_per_partition;
- reduceValueBlock(
- prob_vec_ptr, v_block_cache_ptr, accums);
-
- if (block_idx != block_num - 1) {
- const int64_t next_physical_block_idx =
- seq_block_table[block_idx + 1];
- const scalar_t* __restrict__ next_v_block_cache_ptr =
- v_cache + next_physical_block_idx * kv_block_stride +
- kv_head_idx * kv_head_stride +
- BLOCK_SIZE * head_part_idx * head_elem_num_per_partition;
- vec_op::unroll_loop(
- [&](int head_elem_idx) {
- if (head_elem_idx % 2 == 0) {
- vec_op::prefetch(next_v_block_cache_ptr +
- BLOCK_SIZE * head_elem_idx);
- }
- });
- }
- }
-
- vec_op::unroll_loop(
- [&](int head_elem_idx) {
- float value = accums[head_elem_idx].reduce_sum();
- vec_op::storeFP32(value, out_ptr + head_elem_idx);
- });
- }
- }
- }
- }
-
- // Rescale partition softmax and store the factors to exp_sums
-#pragma omp parallel for collapse(2) schedule(static, 1)
- for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) {
- for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
- const int seq_len = seq_lens[seq_idx];
- const int partition_num =
- (seq_len + PARTITION_SIZE - 1) / PARTITION_SIZE;
-
- if (partition_num == 1) continue;
-
- reducePartitionSoftmax(
- max_logits + seq_idx * num_heads * max_num_partitions +
- head_idx * max_num_partitions,
- exp_sums + seq_idx * num_heads * max_num_partitions +
- head_idx * max_num_partitions,
- partition_num);
- }
- }
-
- // Reduce values
- using v_load_vec_type = typename KernelVecType<scalar_t>::v_load_vec_type;
- static_assert(v_load_vec_type::get_elem_num() == BLOCK_SIZE);
- constexpr int head_elem_num_per_group =
- 16; // Note: didn't align with the cacheline size, due to some
- // HEAD_SIZE didn't align with 64 bytes
- static_assert(HEAD_SIZE % head_elem_num_per_group == 0);
- constexpr int head_group_num = HEAD_SIZE / head_elem_num_per_group;
- const float* __restrict__ rescale_factors = exp_sums;
-#pragma omp parallel for collapse(3) schedule(static, 1)
- for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) {
- for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
- for (int group_idx = 0; group_idx < head_group_num; ++group_idx) {
- const int seq_len = seq_lens[seq_idx];
- const int partition_num =
- (seq_len + PARTITION_SIZE - 1) / PARTITION_SIZE;
-
- if (partition_num == 1) continue;
-
- const float* __restrict__ seq_head_rescale_factors =
- rescale_factors + seq_idx * num_heads * max_num_partitions +
- head_idx * max_num_partitions;
- const scalar_t* __restrict__ seq_head_tmp_out =
- tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE +
- head_idx * max_num_partitions * HEAD_SIZE +
- group_idx * head_elem_num_per_group;
- scalar_t* __restrict__ seq_head_output =
- out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE +
- group_idx * head_elem_num_per_group;
-
- vec_op::FP32Vec16 acc;
- for (int i = 0; i < partition_num; ++i) {
- vec_op::FP32Vec16 rescale_factor(seq_head_rescale_factors[i]);
- v_load_vec_type value(seq_head_tmp_out + i * HEAD_SIZE);
- vec_op::FP32Vec16 fp32_value(value);
- acc = acc + fp32_value * rescale_factor;
- }
- v_load_vec_type cast_acc(acc);
- cast_acc.save(seq_head_output);
- }
- }
- }
- }
-};
-
-#define LAUNCH_V2_ATTENTION_KERNEL(T, HEAD_SIZE, BLOCK_SIZE) \
- paged_attention_v2_impl::call( \
- out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, query_ptr, \
- key_cache_ptr, value_cache_ptr, num_kv_heads, scale, block_tables_ptr, \
- seq_lens_ptr, max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, \
- kv_block_stride, kv_head_stride, num_seqs, num_heads, \
- max_num_partitions);
-
-template
-void paged_attention_v2_impl_launcher(
- torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
- torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
- torch::Tensor& value_cache, int num_kv_heads, float scale,
- torch::Tensor& block_tables, torch::Tensor& seq_lens, int block_size,
- int max_seq_len, const std::optional<torch::Tensor>& alibi_slopes) {
- int num_seqs = query.size(0);
- int num_heads = query.size(1);
- int head_size = query.size(2);
- int max_num_blocks_per_seq = block_tables.size(1);
- int q_stride = query.stride(0);
- int kv_block_stride = key_cache.stride(0);
- int kv_head_stride = key_cache.stride(1);
- int max_num_partitions = exp_sums.size(-1);
-
- // NOTE: alibi_slopes is optional.
- const float* alibi_slopes_ptr =
- alibi_slopes
- ? reinterpret_cast<const float*>(alibi_slopes.value().data_ptr())
- : nullptr;
-
- T* out_ptr = reinterpret_cast<T*>(out.data_ptr());
- float* exp_sums_ptr = reinterpret_cast<float*>(exp_sums.data_ptr());
- float* max_logits_ptr = reinterpret_cast<float*>(max_logits.data_ptr());
- T* tmp_out_ptr = reinterpret_cast<T*>(tmp_out.data_ptr());
- T* query_ptr = reinterpret_cast<T*>(query.data_ptr());
- T* key_cache_ptr = reinterpret_cast<T*>(key_cache.data_ptr());
- T* value_cache_ptr = reinterpret_cast<T*>(value_cache.data_ptr());
- int* block_tables_ptr = block_tables.data_ptr<int>();
- int* seq_lens_ptr = seq_lens.data_ptr<int>();
-
- switch (head_size) {
- case 32:
- LAUNCH_V2_ATTENTION_KERNEL(T, 32, BLOCK_SIZE);
- break;
- case 64:
- LAUNCH_V2_ATTENTION_KERNEL(T, 64, BLOCK_SIZE);
- break;
- case 80:
- LAUNCH_V2_ATTENTION_KERNEL(T, 80, BLOCK_SIZE);
- break;
- case 96:
- LAUNCH_V2_ATTENTION_KERNEL(T, 96, BLOCK_SIZE);
- break;
- case 112:
- LAUNCH_V2_ATTENTION_KERNEL(T, 112, BLOCK_SIZE);
- break;
- case 128:
- LAUNCH_V2_ATTENTION_KERNEL(T, 128, BLOCK_SIZE);
- break;
- case 192:
- LAUNCH_V2_ATTENTION_KERNEL(T, 192, BLOCK_SIZE);
- break;
- case 256:
- LAUNCH_V2_ATTENTION_KERNEL(T, 256, BLOCK_SIZE);
- break;
- default:
- TORCH_CHECK(false, "Unsupported head size: ", head_size);
- break;
- }
-}
-
-#define CALL_V2_KERNEL_LAUNCHER(T, BLOCK_SIZE) \
- paged_attention_v2_impl_launcher( \
- out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \
- num_kv_heads, scale, block_tables, seq_lens, block_size, max_seq_len, \
- alibi_slopes);
-
-#define CALL_V2_KERNEL_LAUNCHER_BLOCK_SIZE(T) \
- switch (block_size) { \
- case 16: \
- CALL_V2_KERNEL_LAUNCHER(T, 16); \
- break; \
- default: \
- TORCH_CHECK(false, "Unsupported block size: ", block_size); \
- break; \
- }
-} // namespace
-
-void paged_attention_v2(
- torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
- torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
- torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
- torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
- int64_t max_seq_len, const std::optional<torch::Tensor>& alibi_slopes,
- const std::string& kv_cache_dtype, torch::Tensor& k_scale,
- torch::Tensor& v_scale, const int64_t tp_rank,
- const int64_t blocksparse_local_blocks,
- const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
- const int64_t blocksparse_head_sliding_step) {
- TORCH_CHECK(blocksparse_vert_stride <= 1,
- "CPU backend does not support blocksparse attention yet.");
- VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "paged_attention_v2_impl",
- [&] {
- CPU_KERNEL_GUARD_IN(paged_attention_v2_impl)
- CALL_V2_KERNEL_LAUNCHER_BLOCK_SIZE(scalar_t);
- CPU_KERNEL_GUARD_OUT(paged_attention_v2_impl)
- });
-}
\ No newline at end of file
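For reference, the deleted `paged_attention_v2` path computed a numerically stable softmax per partition (keeping each partition's max and sum) and then merged the partitions with `reducePartitionSoftmax` before the weighted value reduction. A minimal NumPy sketch of that merge, written against the same definitions rather than the C++ kernel itself:

```python
import numpy as np


def partitioned_softmax(scores: np.ndarray, partition_size: int) -> np.ndarray:
    # Stable softmax per partition, keeping each partition's (max, sum).
    parts, maxes, sums = [], [], []
    for start in range(0, len(scores), partition_size):
        chunk = scores[start:start + partition_size]
        m = chunk.max()
        e = np.exp(chunk - m)
        parts.append(e / e.sum())
        maxes.append(m)
        sums.append(e.sum())

    # Merge: rescale each partition's sum by exp(max_k - global_max), then weight
    # each partition's local probabilities by its share of the global sum.
    maxes, sums = np.array(maxes), np.array(sums)
    rescaled = np.exp(maxes - maxes.max()) * sums
    factors = rescaled / rescaled.sum()
    return np.concatenate([f * p for f, p in zip(factors, parts)])


scores = np.random.default_rng(0).standard_normal(40)
ref = np.exp(scores - scores.max())
ref /= ref.sum()
assert np.allclose(partitioned_softmax(scores, 16), ref)
```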
diff --git a/csrc/cpu/cache.cpp b/csrc/cpu/cache.cpp
deleted file mode 100644
index 69f6d06e3c96..000000000000
--- a/csrc/cpu/cache.cpp
+++ /dev/null
@@ -1,214 +0,0 @@
-#include