diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
index 12f730738b8a..38c400ba1faf 100644
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -132,7 +132,7 @@ steps:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
env:
diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh
index 58fd435691f4..864eb470bb0a 100755
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -59,7 +59,7 @@ while true; do
fi
done
-echo "--- Pulling container"
+echo "--- Pulling container"
image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
docker pull "${image_name}"
@@ -78,17 +78,13 @@ HF_MOUNT="/root/.cache/huggingface"
commands=$@
echo "Commands:$commands"
-if [[ $commands == *"pytest -v -s basic_correctness/test_basic_correctness.py"* ]]; then
- commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s basic_correctness/test_basic_correctness.py"}
-fi
+commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"pytest -v -s basic_correctness/test_basic_correctness.py"}
if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
fi
-if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
- commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
-fi
+commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"pytest -v -s compile/test_basic_correctness.py"}
if [[ $commands == *"pytest -v -s lora"* ]]; then
commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
@@ -181,13 +177,13 @@ if [[ -z "$render_gid" ]]; then
exit 1
fi
-# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
+# check if the command contains the shard flag; if so, run all shards in parallel because the host has 8 GPUs.
if [[ $commands == *"--shard-id="* ]]; then
- # assign job count as the number of shards used
- commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
+ # assign job count as the number of shards used
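+  # rewrite "--num-shards=N" (allowing spaces around "=") to ${PARALLEL_JOB_COUNT} and collapse stray " \ " sequences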
+ commands=$(echo "$commands" | sed -E "s/--num-shards[[:blank:]]*=[[:blank:]]*[0-9]*/--num-shards=${PARALLEL_JOB_COUNT} /g" | sed 's/ \\ / /g')
for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
# assign shard-id for each shard
- commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
+ commands_gpu=$(echo "$commands" | sed -E "s/--shard-id[[:blank:]]*=[[:blank:]]*[0-9]*/--shard-id=${GPU} /g" | sed 's/ \\ / /g')
echo "Shard ${GPU} commands:$commands_gpu"
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
docker run \
diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
index 7927aef19e4e..7479c43977d7 100644
--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@@ -49,6 +49,7 @@ function cpu_tests() {
# Run kernel tests
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
+ pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
pytest -x -v -s tests/kernels/test_onednn.py"
# Run basic model test
@@ -76,7 +77,7 @@ function cpu_tests() {
# Run AWQ test
# docker exec cpu-test-"$NUMA_NODE" bash -c "
# set -e
- # VLLM_USE_V1=0 pytest -x -s -v \
+ # pytest -x -s -v \
# tests/quantization/test_ipex_quant.py"
# Run multi-lora tests
@@ -116,4 +117,4 @@ function cpu_tests() {
# All of CPU tests are expected to be finished less than 40 mins.
export -f cpu_tests
-timeout 2h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
+timeout 2.5h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
index 27ed67c4517e..d49f3e2f47cf 100644
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@@ -46,6 +46,6 @@ docker run \
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
pytest -v -s v1/structured_output
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py
- pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
+ pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
pytest -v -s v1/test_serial_utils.py
'
diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index bb5ef5d62463..2471b509a9ff 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -226,6 +226,27 @@ steps:
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
- popd
+- label: Distributed Tests (8 GPUs) # 4min
+ timeout_in_minutes: 10
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_8
+ # grade: Blocking
+ gpu: h100
+ num_gpus: 8
+ working_dir: "/vllm-workspace/tests"
+ source_file_dependencies:
+ - examples/offline_inference/torchrun_dp_example.py
+ - vllm/config/parallel.py
+ - vllm/distributed/
+ - vllm/v1/engine/llm_engine.py
+ - vllm/v1/executor/uniproc_executor.py
+ - vllm/v1/worker/gpu_worker.py
+ commands:
+ # https://github.com/NVIDIA/nccl/issues/1838
+ #- export NCCL_CUMEM_HOST_ENABLE=0
+  # test torchrun with tp=2, dp=4, and expert parallelism (EP) enabled
+ - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
+
- label: EPLB Algorithm Test # 5min
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
@@ -238,11 +259,11 @@ steps:
commands:
- pytest -v -s distributed/test_eplb_algo.py
-- label: EPLB Execution Test # 5min
+- label: EPLB Execution Test # 10min
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_4
# grade: Blocking
- timeout_in_minutes: 15
+ timeout_in_minutes: 20
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:
@@ -250,6 +271,7 @@ steps:
- tests/distributed/test_eplb_execute.py
commands:
- pytest -v -s distributed/test_eplb_execute.py
+ - pytest -v -s distributed/test_eplb_spec_decode.py
- label: Metrics, Tracing Test # 12min
timeout_in_minutes: 20
@@ -273,7 +295,7 @@ steps:
- label: Regression Test # 7min
timeout_in_minutes: 20
- mirror_hardwares: [amdexperimental, amdproduction]
+ mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
agent_pool: mi325_1
grade: Blocking
source_file_dependencies:
@@ -288,7 +310,7 @@ steps:
timeout_in_minutes: 40
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
- #grade: Blocking
+ # grade: Blocking
source_file_dependencies:
- vllm/
- tests/engine
@@ -337,6 +359,7 @@ steps:
- tests/v1
commands:
# split the test to avoid interference
+ - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- pytest -v -s -m 'not cpu_test' v1/core
- pytest -v -s v1/executor
- pytest -v -s v1/kv_offload
@@ -344,14 +367,29 @@ steps:
- pytest -v -s v1/logits_processors
- pytest -v -s v1/worker
- pytest -v -s v1/spec_decode
- - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_lmcache_integration.py
+ - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
- pytest -v -s -m 'not cpu_test' v1/metrics
- pytest -v -s v1/test_oracle.py
- pytest -v -s v1/test_request.py
+ - pytest -v -s v1/test_outputs.py
# Integration test for streaming correctness (requires special branch).
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
+# TODO: Add the "V1 Test attention (MI300)" test group
+
+- label: V1 Test attention (H100) # 10min
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ timeout_in_minutes: 30
+ gpu: h100
+ source_file_dependencies:
+ - vllm/v1/attention
+ - tests/v1/attention
+ commands:
+ - pytest -v -s v1/attention
+
- label: V1 Test others (CPU) # 5 mins
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
@@ -478,10 +516,11 @@ steps:
- tests/compile
commands:
- pytest -v -s compile/test_basic_correctness.py
+ - pytest -v -s compile/test_multimodal_compile.py
- pytest -v -s compile/piecewise/
-- label: PyTorch Fullgraph Test # 22min
- timeout_in_minutes: 35
+- label: PyTorch Fullgraph Test # 27min
+ timeout_in_minutes: 40
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
@@ -490,8 +529,23 @@ steps:
- vllm/
- tests/compile
commands:
- - pytest -v -s compile/test_full_graph.py
- - pytest -v -s compile/test_fusions_e2e.py
+ - pytest -v -s compile/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
+ # Limit to no custom ops to reduce running time
+ # Wrap with quotes to escape yaml and avoid starting -k string with a -
+ - "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and -quant_fp8'"
+
+- label: Cudagraph test
+ timeout_in_minutes: 20
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_1
+ source_file_dependencies:
+ - tests/v1/cudagraph
+ - vllm/v1/cudagraph_dispatcher.py
+ - vllm/config/compilation.py
+ - vllm/compilation
+ commands:
+ - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py
+ - pytest -v -s v1/cudagraph/test_cudagraph_mode.py
- label: Kernels Core Operation Test # 48min
timeout_in_minutes: 75
@@ -543,6 +597,8 @@ steps:
- tests/kernels/moe
- vllm/model_executor/layers/fused_moe/
- vllm/distributed/device_communicators/
+ - vllm/envs.py
+ - vllm/config
commands:
- pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 2
@@ -561,10 +617,13 @@ steps:
- label: Model Executor Test # 23min
timeout_in_minutes: 35
+ torch_nightly: true
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
source_file_dependencies:
+ - vllm/engine/arg_utils.py
+ - vllm/config/model.py
- vllm/model_executor
- tests/model_executor
- tests/entrypoints/openai/test_tensorizer_entrypoint.py
@@ -860,9 +919,10 @@ steps:
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
- label: Multi-Modal Accuracy Eval (Small Models) # 10min
+ timeout_in_minutes: 70
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
- timeout_in_minutes: 15
+ # grade: Blocking
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- vllm/multimodal/
@@ -933,6 +993,7 @@ steps:
- label: Transformers Nightly Models Test
mirror_hardwares: [amdexperimental]
agent_pool: mi325_1
+ # grade: Blocking
working_dir: "/vllm-workspace/"
optional: true
commands:
@@ -960,11 +1021,16 @@ steps:
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
+ - vllm/v1/attention/backends/mla/cutlass_mla.py
+ - vllm/v1/attention/backends/mla/flashinfer_mla.py
+ - vllm/platforms/cuda.py
+ - vllm/attention/selector.py
commands:
- nvidia-smi
- python3 examples/offline_inference/basic/chat.py
# Attention
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
+ - pytest -v -s tests/kernels/attention/test_attention_selector.py
- pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
- pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
- pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
@@ -1001,12 +1067,39 @@ steps:
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
# this runner has 2 GPUs available even though num_gpus=2 is not set
- pytest -v -s tests/compile/test_fusion_all_reduce.py
+ # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
+ # Wrap with quotes to escape yaml
+ - "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'"
+
+- label: Blackwell Fusion E2E Tests # 30 min
+ timeout_in_minutes: 40
+ working_dir: "/vllm-workspace/"
+ gpu: b200
+ optional: true
+ num_gpus: 2
+ source_file_dependencies:
+ - csrc/quantization/fp4/
+ - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+ - vllm/v1/attention/backends/flashinfer.py
+ - vllm/compilation/
+ # can affect pattern matching
+ - vllm/model_executor/layers/layernorm.py
+ - vllm/model_executor/layers/activation.py
+ - vllm/model_executor/layers/quantization/input_quant_fp8.py
+ - tests/compile/test_fusions_e2e.py
+ - tests/compile/test_full_graph.py
+ commands:
+ - nvidia-smi
+ # Run all e2e fusion tests
- pytest -v -s tests/compile/test_fusions_e2e.py
+ # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
+ - pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile
-- label: Blackwell GPT-OSS Eval
+- label: ROCm GPT-OSS Eval
timeout_in_minutes: 60
working_dir: "/vllm-workspace/"
- gpu: b200
+ agent_pool: mi325_1
+ mirror_hardwares: [amdproduction]
optional: true # run on nightlies
source_file_dependencies:
- tests/evals/gpt_oss
@@ -1015,7 +1108,7 @@ steps:
- vllm/v1/attention/backends/flashinfer.py
commands:
- uv pip install --system 'gpt-oss[eval]==0.0.5'
- - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
+ - VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
- label: Blackwell Quantized MoE Test
timeout_in_minutes: 60
@@ -1252,6 +1345,7 @@ steps:
- label: NixlConnector PD accuracy tests (Distributed) # 30min
mirror_hardwares: [amdexperimental]
agent_pool: mi325_4
+ # grade: Blocking
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
num_gpus: 4
@@ -1266,6 +1360,9 @@ steps:
##### A100 test #####
- label: Distributed Tests (A100) # optional
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_4
+ # grade: Blocking
gpu: a100
optional: true
num_gpus: 4
@@ -1280,6 +1377,9 @@ steps:
- pytest -v -s -x lora/test_mixtral.py
- label: LM Eval Large Models # optional
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_4
+ # grade: Blocking
gpu: a100
optional: true
num_gpus: 4
@@ -1291,8 +1391,27 @@ steps:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
+##### H100 test #####
+- label: LM Eval Large Models (H100) # optional
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_4
+ # grade: Blocking
+ gpu: h100
+ optional: true
+ num_gpus: 4
+ working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+ source_file_dependencies:
+ - csrc/
+ - vllm/model_executor/layers/quantization
+ commands:
+ - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
+ - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
+
##### H200 test #####
- label: Distributed Tests (H200) # optional
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_2
+ # grade: Blocking
gpu: h200
optional: true
working_dir: "/vllm-workspace/"
@@ -1304,6 +1423,7 @@ steps:
- pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
- pytest -v -s tests/distributed/test_context_parallel.py
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
+ - pytest -v -s tests/v1/distributed/test_dbo.py
##### B200 test #####
- label: Distributed Tests (B200) # optional
@@ -1314,6 +1434,7 @@ steps:
commands:
- pytest -v -s tests/distributed/test_context_parallel.py
- pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
+ - pytest -v -s tests/v1/distributed/test_dbo.py
##### RL Integration Tests #####
- label: Prime-RL Integration Test # 15min
@@ -1329,3 +1450,27 @@ steps:
- .buildkite/scripts/run-prime-rl-test.sh
commands:
- bash .buildkite/scripts/run-prime-rl-test.sh
+
+- label: DeepSeek V2-Lite Accuracy
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_4
+ # grade: Blocking
+ timeout_in_minutes: 60
+ gpu: h100
+ optional: true
+ num_gpus: 4
+ working_dir: "/vllm-workspace"
+ commands:
+ - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
+
+- label: Qwen3-30B-A3B-FP8-block Accuracy
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_4
+ # grade: Blocking
+ timeout_in_minutes: 60
+ gpu: h100
+ optional: true
+ num_gpus: 4
+ working_dir: "/vllm-workspace"
+ commands:
+ - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh 0.8 200 8020
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 3152cd6488f3..7396a3875c2b 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -25,6 +25,7 @@
# and $$BUILDKITE_PARALLEL_JOB_COUNT environment variables.
# working_dir(str): specify the place where the command should execute, default to /vllm-workspace/tests
# source_file_dependencies(list): the list of prefixes to opt-in the test for, if empty, the test will always run.
+# autorun_on_main (bool): defaults to false; if true, the test runs automatically when a commit is pushed to the main branch.
# When adding a test
# - If the test belongs to an existing group, add it there
@@ -56,7 +57,7 @@ steps:
- pytest -v -s -m 'not cpu_test' multimodal
- pytest -v -s utils_
-- label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins
+- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 4 mins
timeout_in_minutes: 10
source_file_dependencies:
- vllm/
@@ -65,6 +66,7 @@ steps:
- tests/multimodal
- tests/standalone_tests/lazy_imports.py
- tests/transformers_utils
+ - tests/config
no_gpu: true
commands:
- python3 standalone_tests/lazy_imports.py
@@ -72,6 +74,7 @@ steps:
- pytest -v -s test_outputs.py
- pytest -v -s -m 'cpu_test' multimodal
- pytest -v -s transformers_utils
+ - pytest -v -s config
- label: Python-only Installation Test # 10min
timeout_in_minutes: 20
@@ -329,6 +332,7 @@ steps:
- pytest -v -s -m 'not cpu_test' v1/metrics
- pytest -v -s v1/test_oracle.py
- pytest -v -s v1/test_request.py
+ - pytest -v -s v1/test_outputs.py
# Integration test for streaming correctness (requires special branch).
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
@@ -441,6 +445,8 @@ steps:
- vllm/
- tests/compile
commands:
+ - pytest -v -s compile/test_graph_partition.py
+ - pytest -v -s compile/test_config.py
- pytest -v -s compile/test_pass_manager.py
- pytest -v -s compile/test_fusion.py
- pytest -v -s compile/test_fusion_attn.py
@@ -450,6 +456,8 @@ steps:
- pytest -v -s compile/test_decorator.py
- pytest -v -s compile/test_noop_elimination.py
- pytest -v -s compile/test_aot_compile.py
+ - pytest -v -s compile/test_qk_norm_rope_fusion.py
+ - pytest -v -s compile/test_compile_ranges.py
- label: PyTorch Fullgraph Smoke Test # 15min
timeout_in_minutes: 30
@@ -471,10 +479,11 @@ steps:
- vllm/
- tests/compile
commands:
- - pytest -v -s compile/test_full_graph.py
- # Limit to no custom ops to reduce running time
+ # fp8 kv scales not supported on sm89, tested on Blackwell instead
+ - pytest -v -s compile/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
+ # Limit to no custom ops to reduce running time
# Wrap with quotes to escape yaml and avoid starting -k string with a -
- - "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and -quant_fp8'"
+ - "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
- label: Cudagraph test
timeout_in_minutes: 20
@@ -604,6 +613,7 @@ steps:
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
+ autorun_on_main: true
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
@@ -867,12 +877,12 @@ steps:
optional: true
commands:
- pip install --upgrade git+https://github.com/huggingface/transformers
- - pytest -v -s tests/models/test_initialization.py
+ - pytest -v -s tests/models/test_initialization.py -k 'not (Gemma3 or ModernBert or Qwen2_5_VL or Qwen2_5vl or Qwen2VL or TransformersMultiModalEmbeddingModel or TransformersMultiModalForSequenceClassification or Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or PaliGemma or RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)'
- pytest -v -s tests/models/test_transformers.py
- - pytest -v -s tests/models/multimodal/processing/
- - pytest -v -s tests/models/multimodal/test_mapping.py
+ # - pytest -v -s tests/models/multimodal/processing/
+ - pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)'
- python3 examples/offline_inference/basic/chat.py
- - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
+ # - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
# Whisper needs spawn method to avoid deadlock
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
@@ -890,11 +900,16 @@ steps:
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
+ - vllm/v1/attention/backends/mla/cutlass_mla.py
+ - vllm/v1/attention/backends/mla/flashinfer_mla.py
+ - vllm/platforms/cuda.py
+ - vllm/attention/selector.py
commands:
- nvidia-smi
- python3 examples/offline_inference/basic/chat.py
# Attention
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
+ - pytest -v -s tests/kernels/attention/test_attention_selector.py
- pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
- pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
- pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
@@ -912,7 +927,7 @@ steps:
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
- pytest -v -s tests/kernels/moe/test_flashinfer.py
-- label: Blackwell Fusion Tests # 30 min
+- label: Blackwell Fusion and Compile Tests # 30 min
timeout_in_minutes: 40
working_dir: "/vllm-workspace/"
gpu: b200
@@ -932,8 +947,10 @@ steps:
# this runner has 2 GPUs available even though num_gpus=2 is not set
- pytest -v -s tests/compile/test_fusion_all_reduce.py
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
- # Wrap with quotes to escape yaml
- - "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'"
+ # Wrap with quotes to escape yaml
+ - "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
+ # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
+ - pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile
- label: Blackwell Fusion E2E Tests # 30 min
timeout_in_minutes: 40
@@ -951,6 +968,7 @@ steps:
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/test_fusions_e2e.py
+ - tests/compile/test_full_graph.py
commands:
- nvidia-smi
# Run all e2e fusion tests
@@ -1250,7 +1268,8 @@ steps:
- pytest -v -s tests/compile/test_async_tp.py
- pytest -v -s tests/compile/test_sequence_parallelism.py
- pytest -v -s tests/compile/test_fusion_all_reduce.py
- - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
+ - "pytest -v -s tests/compile/test_fusions_e2e.py -k 'not Llama-4'"
+ - pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
- pytest -v -s tests/v1/distributed/test_dbo.py
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 23def076cf88..6e178bb690c5 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -3,8 +3,8 @@
# This lists cover the "core" components of vLLM that require careful review
/vllm/attention @LucasWilkinson
-/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
+/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @njhill
+/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn
/vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
/vllm/model_executor/layers/mamba @tdoublep
@@ -20,15 +20,15 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
# Any change to the VllmConfig changes can have a large user-facing impact,
# so spam a lot of people
-/vllm/config @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
-/vllm/config/cache.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345
+/vllm/config @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
+/vllm/config/cache.py @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345
# vLLM V1
/vllm/v1/attention @LucasWilkinson
/vllm/v1/attention/backends/mla @pavanimajety
/vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
/vllm/v1/attention/backends/triton_attn.py @tdoublep
-/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
+/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC
/vllm/v1/sample @22quinn @houseroad @njhill
/vllm/v1/spec_decode @benchislett @luccafong
/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
@@ -36,11 +36,11 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/vllm/v1/offloading @ApostaC
# Test ownership
-/.buildkite/lm-eval-harness @mgoin @simon-mo
+/.buildkite/lm-eval-harness @mgoin
/tests/distributed/test_multi_node_assignment.py @youkaichao
/tests/distributed/test_pipeline_parallel.py @youkaichao
/tests/distributed/test_same_node.py @youkaichao
-/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm @NickLucche
+/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @aarnphm @NickLucche
/tests/evals @mgoin
/tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256
/tests/models @DarkLight1337 @ywang96
@@ -49,7 +49,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/tests/test_inputs.py @DarkLight1337 @ywang96
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
/tests/v1/structured_output @mgoin @russellb @aarnphm
-/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
+/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC
/tests/weight_loading @mgoin @youkaichao @yewentao256
/tests/lora @jeejeelee
/tests/models/language/generation/test_hybrid.py @tdoublep
@@ -57,10 +57,20 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/tests/v1/kv_connector @ApostaC
/tests/v1/offloading @ApostaC
-# Transformers backend
+# Transformers modeling backend
/vllm/model_executor/models/transformers @hmellor
/tests/models/test_transformers.py @hmellor
+# Observability
+/vllm/config/observability.py @markmc
+/vllm/v1/metrics @markmc
+/tests/v1/metrics @markmc
+/vllm/tracing.py @markmc
+/tests/v1/tracing/test_tracing.py @markmc
+/vllm/config/kv_events.py @markmc
+/vllm/distributed/kv_events.py @markmc
+/tests/distributed/test_events.py @markmc
+
# Docs
/docs/mkdocs @hmellor
/docs/**/*.yml @hmellor
diff --git a/.github/mergify.yml b/.github/mergify.yml
index 18d4a2e83144..997a40e18e58 100644
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -151,6 +151,23 @@ pull_request_rules:
add:
- gpt-oss
+- name: label-nvidia
+ description: Automatically apply nvidia label
+ conditions:
+ - label != stale
+ - or:
+ - files~=cuda
+ - files~=cutlass
+ - files~=flashinfer
+ - files~=trtllm
+ - title~=(?i)NVIDIA
+ - title~=(?i)CUDA
+ - title~=(?i)CUTLASS
+ actions:
+ label:
+ add:
+ - nvidia
+
- name: label-rocm
description: Automatically apply rocm label
conditions:
diff --git a/.github/workflows/macos-smoke-test.yml b/.github/workflows/macos-smoke-test.yml
new file mode 100644
index 000000000000..42b05ecd5ac0
--- /dev/null
+++ b/.github/workflows/macos-smoke-test.yml
@@ -0,0 +1,81 @@
+name: macOS Apple Silicon Smoke Test
+
+on:
+ push:
+ branches:
+ - main
+ workflow_dispatch: # Manual trigger
+
+jobs:
+ macos-m1-smoke-test:
+ runs-on: macos-latest
+ timeout-minutes: 20
+
+ steps:
+ - uses: actions/checkout@v4
+
+ - uses: astral-sh/setup-uv@v7
+ with:
+ enable-cache: true
+ cache-dependency-glob: |
+ requirements/**/*.txt
+ pyproject.toml
+ python-version: '3.12'
+
+ - name: Create virtual environment
+ run: |
+ uv venv
+ echo "$GITHUB_WORKSPACE/.venv/bin" >> "$GITHUB_PATH"
+
+ - name: Install dependencies and build vLLM
+ run: |
+ uv pip install -r requirements/cpu.txt --index-strategy unsafe-best-match
+ uv pip install -e .
+ env:
+ CMAKE_BUILD_PARALLEL_LEVEL: 4
+
+ - name: Verify installation
+ run: |
+ python -c "import vllm; print(f'vLLM version: {vllm.__version__}')"
+ python -c "import torch; print(f'PyTorch: {torch.__version__}')"
+
+ - name: Smoke test vllm serve
+ timeout-minutes: 10
+ run: |
+ # Start server in background
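+          # Dummy weight loading (--load-format=dummy) and --enforce-eager keep the smoke test fast; no real model weights are downloaded.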
+ vllm serve Qwen/Qwen3-0.6B \
+ --max-model-len=2048 \
+ --load-format=dummy \
+ --enforce-eager \
+ --port 8000 &
+
+ SERVER_PID=$!
+
+ # Wait for server to start
+ for i in {1..30}; do
+ if curl -s http://localhost:8000/health > /dev/null; then
+ echo "Server started successfully"
+ break
+ fi
+ if [ "$i" -eq 30 ]; then
+ echo "Server failed to start"
+ kill "$SERVER_PID"
+ exit 1
+ fi
+ sleep 2
+ done
+
+ # Test health endpoint
+ curl -f http://localhost:8000/health
+
+ # Test completion
+ curl -f http://localhost:8000/v1/completions \
+ -H "Content-Type: application/json" \
+ -d '{
+ "model": "Qwen/Qwen3-0.6B",
+ "prompt": "Hello",
+ "max_tokens": 5
+ }'
+
+ # Cleanup
+ kill "$SERVER_PID"
diff --git a/.markdownlint.yaml b/.markdownlint.yaml
index cd9df57cd980..937487f47364 100644
--- a/.markdownlint.yaml
+++ b/.markdownlint.yaml
@@ -3,10 +3,9 @@ MD007:
MD013: false
MD024:
siblings_only: true
+MD031:
+ list_items: false
MD033: false
-MD045: false
MD046: false
-MD051: false
MD052: false
-MD053: false
MD059: false
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0e9fa63b178e..3a37040edbf1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -39,6 +39,13 @@ set(PYTHON_SUPPORTED_VERSIONS "3.10" "3.11" "3.12" "3.13")
# Supported AMD GPU architectures.
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
+# ROCm installation prefix. Default to /opt/rocm but allow override via
+# -DROCM_PATH=/your/rocm/path when invoking cmake.
+if(NOT DEFINED ROCM_PATH)
+ set(ROCM_PATH "/opt/rocm" CACHE PATH "ROCm installation prefix")
+else()
+ set(ROCM_PATH ${ROCM_PATH} CACHE PATH "ROCm installation prefix" FORCE)
+endif()
#
# Supported/expected torch versions for CUDA/ROCm.
#
@@ -237,10 +244,27 @@ set_gencode_flags_for_srcs(
SRCS "${VLLM_CUMEM_EXT_SRC}"
CUDA_ARCHS "${CUDA_ARCHS}")
-if(VLLM_GPU_LANG STREQUAL "CUDA")
+if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
message(STATUS "Enabling cumem allocator extension.")
- # link against cuda driver library
- list(APPEND CUMEM_LIBS CUDA::cuda_driver)
+ if(VLLM_GPU_LANG STREQUAL "CUDA")
+ # link against cuda driver library
+ list(APPEND CUMEM_LIBS CUDA::cuda_driver)
+ else()
+ # link against rocm driver library. Prefer an absolute path to
+ # libamdhip64.so inside ${ROCM_PATH}/lib if available, otherwise fall
+ # back to linking by name "amdhip64".
+ find_library(AMDHIP64_LIB
+ NAMES amdhip64 libamdhip64.so
+ PATHS ${ROCM_PATH}/lib
+ NO_DEFAULT_PATH)
+ if(AMDHIP64_LIB)
+ message(STATUS "Found libamdhip64 at ${AMDHIP64_LIB}")
+ list(APPEND CUMEM_LIBS ${AMDHIP64_LIB})
+ else()
+ message(WARNING "libamdhip64 not found in ${ROCM_PATH}/lib; falling back to linking 'amdhip64' by name")
+ list(APPEND CUMEM_LIBS amdhip64)
+ endif()
+ endif()
define_extension_target(
cumem_allocator
DESTINATION vllm
@@ -265,6 +289,7 @@ set(VLLM_EXT_SRC
"csrc/pos_encoding_kernels.cu"
"csrc/activation_kernels.cu"
"csrc/layernorm_kernels.cu"
+ "csrc/fused_qknorm_rope_kernel.cu"
"csrc/layernorm_quant_kernels.cu"
"csrc/sampler.cu"
"csrc/cuda_view.cu"
@@ -330,7 +355,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# Keep building Marlin for 9.0 as there are some group sizes and shapes that
# are not supported by Machete yet.
# 9.0 for latest bf16 atomicAdd PTX
- cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
+ cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
if (MARLIN_ARCHS)
#
@@ -836,7 +861,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
# Hadacore kernels
- cuda_archs_loose_intersection(HADACORE_ARCHS "8.0;8.9;9.0" "${CUDA_ARCHS}")
+ cuda_archs_loose_intersection(HADACORE_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
if(HADACORE_ARCHS)
set(SRCS "csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu")
set_gencode_flags_for_srcs(
@@ -914,7 +939,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
# 9.0 for latest bf16 atomicAdd PTX
- cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
+ cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
if (MARLIN_MOE_ARCHS)
#
diff --git a/README.md b/README.md
index b5e230e4b9b0..033e1035d891 100644
--- a/README.md
+++ b/README.md
@@ -21,6 +21,7 @@ Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundatio
*Latest News* 🔥
+- [2025/11] We hosted [the first vLLM Europe Meetup in Zurich](https://luma.com/0gls27kb) focused on quantization, distributed inference, and reinforcement learning at scale with speakers from Mistral, IBM, and Red Hat. Please find the meetup slides [here](https://docs.google.com/presentation/d/1UC9PTLCHYXQpOmJDSFg6Sljra3iVXzc09DeEI7dnxMc/edit?usp=sharing) and the recording [here](https://www.youtube.com/watch?v=6m6ZE6yVEDI).
- [2025/11] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/xSrYXjNgr1HbCP4ExYNG1w) focusing on distributed inference and diverse accelerator support with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1nQJ8ZkLSjKxvu36sSHaceVXtttbLvvu-?usp=drive_link).
- [2025/10] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg) focused on hands-on vLLM inference optimization! Please find the meetup slides [here](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6).
- [2025/09] We hosted [vLLM Toronto Meetup](https://luma.com/e80e0ymm) focused on tackling inference at scale and speculative decoding with speakers from NVIDIA and Red Hat! Please find the meetup slides [here](https://docs.google.com/presentation/d/1IYJYmJcu9fLpID5N5RbW_vO0XLo0CGOR14IXOjB61V8/edit?usp=sharing).
diff --git a/benchmarks/benchmark_batch_invariance.py b/benchmarks/benchmark_batch_invariance.py
new file mode 100755
index 000000000000..b5c16c42de46
--- /dev/null
+++ b/benchmarks/benchmark_batch_invariance.py
@@ -0,0 +1,380 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Benchmark to measure the performance overhead of VLLM_BATCH_INVARIANT mode.
+
+This benchmark runs the same workload twice:
+1. With VLLM_BATCH_INVARIANT=0 (baseline)
+2. With VLLM_BATCH_INVARIANT=1 (batch invariant mode)
+
+And reports the timing and throughput metrics for comparison.
+
+Environment variables:
+ VLLM_BENCH_MODEL: Model to benchmark (default: "Qwen/Qwen3-1.7B")
+ VLLM_BENCH_TP_SIZE: Tensor parallel size (default: 1, use 8 for deepseek)
+ VLLM_BENCH_BATCH_SIZE: Max batch size (default: 128)
+ VLLM_BENCH_NUM_TRIALS: Number of trials to run (default: 5)
+ VLLM_BENCH_MIN_PROMPT: Min prompt length in words (default: 1024)
+ VLLM_BENCH_MAX_PROMPT: Max prompt length in words (default: 2048)
+ VLLM_BENCH_MAX_TOKENS: Max tokens to generate (default: 128)
+ VLLM_BENCH_TEMPERATURE: Temperature for sampling (default: 0.0)
+ VLLM_BENCH_GPU_MEMORY_UTILIZATION: GPU memory utilization (default: 0.4)
+ VLLM_BENCH_MAX_MODEL_LEN: Max model length (default: 5120)
+ VLLM_BENCH_BACKEND: Attention backend (default: FLASH_ATTN)
+
+Example usage:
+ # Benchmark qwen3 (default)
+ python benchmarks/benchmark_batch_invariance.py
+
+ # Benchmark deepseek with 8 GPUs
+ VLLM_BENCH_MODEL="deepseek-ai/DeepSeek-V3" VLLM_BENCH_TP_SIZE=8 \\
+ python benchmarks/benchmark_batch_invariance.py
+
+ # Quick test with fewer trials
+ VLLM_BENCH_NUM_TRIALS=2 VLLM_BENCH_BATCH_SIZE=32 \\
+ python benchmarks/benchmark_batch_invariance.py
+"""
+
+import contextlib
+import os
+import random
+import time
+
+from vllm import LLM, SamplingParams
+from vllm.platforms import current_platform
+
+
+def _random_prompt(min_words: int = 1024, max_words: int = 1024 * 2) -> str:
+ """Generate a random prompt for benchmarking."""
+ prompt_templates = [
+ "Question: What is the capital of France?\nAnswer: The capital of France is",
+ "Q: How does photosynthesis work?\nA: Photosynthesis is the process by which",
+ "User: Can you explain quantum mechanics?\nAssistant: Quantum mechanics is",
+ "Once upon a time in a distant galaxy, there lived",
+ "The old man walked slowly down the street, remembering",
+ "In the year 2157, humanity finally discovered",
+ "To implement a binary search tree in Python, first we need to",
+ "The algorithm works by iterating through the array and",
+ "Here's how to optimize database queries using indexing:",
+ "The Renaissance was a period in European history that",
+ "Climate change is caused by several factors including",
+ "The human brain contains approximately 86 billion neurons which",
+ "I've been thinking about getting a new laptop because",
+ "Yesterday I went to the store and bought",
+ "My favorite thing about summer is definitely",
+ ]
+
+ base_prompt = random.choice(prompt_templates)
+
+ if max_words < min_words:
+ max_words = min_words
+ target_words = random.randint(min_words, max_words)
+
+ if target_words > 50:
+ padding_text = (
+ " This is an interesting topic that deserves more explanation. "
+ * (target_words // 50)
+ )
+ base_prompt = base_prompt + padding_text
+
+ return base_prompt
+
+
+def run_benchmark_with_batch_invariant(
+ model: str,
+ tp_size: int,
+ max_batch_size: int,
+ num_trials: int,
+ min_prompt: int,
+ max_prompt: int,
+ max_tokens: int,
+ temperature: float,
+ gpu_mem_util: float,
+ max_model_len: int,
+ backend: str,
+ batch_invariant: bool,
+ seed: int = 12345,
+) -> dict:
+ """
+ Run the benchmark with the specified configuration.
+
+ Returns a dict with timing and throughput metrics.
+ """
+ random.seed(seed)
+
+ # Set environment variables
+ os.environ["VLLM_ATTENTION_BACKEND"] = backend
+ if batch_invariant:
+ os.environ["VLLM_BATCH_INVARIANT"] = "1"
+ else:
+ os.environ["VLLM_BATCH_INVARIANT"] = "0"
+
+ print(f"\n{'=' * 80}")
+ print(f"BENCHMARK: VLLM_BATCH_INVARIANT={int(batch_invariant)}")
+ print(f" Model: {model}")
+ print(f" TP Size: {tp_size}")
+ print(f" Backend: {backend}")
+ print(f" Max Batch Size: {max_batch_size}")
+ print(f" Trials: {num_trials}")
+ print(f" Max Tokens: {max_tokens}")
+ print(f"{'=' * 80}\n")
+
+ sampling = SamplingParams(
+ temperature=temperature,
+ top_p=0.95,
+ max_tokens=max_tokens,
+ seed=20240919,
+ )
+
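+    # Short fixed "needle" prompt inserted at a random position in every trial batch (see the trial loop below).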
+ needle_prompt = "There once was a "
+
+ llm = None
+ try:
+ # Create LLM engine
+ start_init = time.perf_counter()
+ llm = LLM(
+ model=model,
+ max_num_seqs=max_batch_size,
+ gpu_memory_utilization=gpu_mem_util,
+ max_model_len=max_model_len,
+ dtype="bfloat16",
+ tensor_parallel_size=tp_size,
+ enable_prefix_caching=False,
+ )
+ init_time = time.perf_counter() - start_init
+ print(f"Engine initialization time: {init_time:.2f}s\n")
+
+ # Generate baseline
+ print("Generating baseline (warmup)...")
+ baseline_out = llm.generate([needle_prompt], sampling)
+ assert len(baseline_out) == 1
+ baseline_text = baseline_out[0].outputs[0].text
+ print(f"Baseline output: '{baseline_text[:50]}...'\n")
+
+ # Run trials and measure timing
+ trial_times: list[float] = []
+ total_tokens = 0
+ total_prompts = 0
+
+ for trial in range(num_trials):
+ # Create a batch
+ prompts: list[str] = []
+ batch_size = random.randint(max_batch_size // 2, max_batch_size)
+ needle_pos = random.randint(0, batch_size - 1)
+ for i in range(batch_size):
+ if i == needle_pos:
+ prompts.append(needle_prompt)
+ else:
+ prompts.append(_random_prompt(min_prompt, max_prompt))
+
+ # Measure time for this trial
+ start_time = time.perf_counter()
+ outputs = llm.generate(prompts, sampling)
+ trial_time = time.perf_counter() - start_time
+
+ trial_times.append(trial_time)
+ total_prompts += len(prompts)
+
+ # Count tokens
+ for output in outputs:
+ if output.outputs:
+ total_tokens += len(output.outputs[0].token_ids)
+
+ print(
+ f"Trial {trial + 1}/{num_trials}: "
+ f"batch_size={batch_size}, "
+ f"time={trial_time:.2f}s"
+ )
+
+ # Verify needle output still matches
+ needle_output = outputs[needle_pos]
+ assert needle_output.prompt == needle_prompt
+
+ # Compute statistics
+ avg_time = sum(trial_times) / len(trial_times)
+ min_time = min(trial_times)
+ max_time = max(trial_times)
+ throughput = total_tokens / sum(trial_times)
+ prompts_per_sec = total_prompts / sum(trial_times)
+
+ print(f"\n{'=' * 80}")
+ print("RESULTS:")
+ print(f" Average time per trial: {avg_time:.2f}s")
+ print(f" Min time: {min_time:.2f}s")
+ print(f" Max time: {max_time:.2f}s")
+ print(f" Total tokens generated: {total_tokens}")
+ print(f" Total prompts processed: {total_prompts}")
+ print(f" Throughput: {throughput:.2f} tokens/s")
+ print(f" Prompts/s: {prompts_per_sec:.2f}")
+ print(f"{'=' * 80}\n")
+
+ return {
+ "init_time": init_time,
+ "avg_time": avg_time,
+ "min_time": min_time,
+ "max_time": max_time,
+ "total_tokens": total_tokens,
+ "total_prompts": total_prompts,
+ "throughput": throughput,
+ "prompts_per_sec": prompts_per_sec,
+ "trial_times": trial_times,
+ }
+
+ finally:
+ # Cleanup
+ if llm is not None:
+ with contextlib.suppress(Exception):
+ llm.shutdown()
+
+
+def main():
+ # Check platform support
+ if not (current_platform.is_cuda() and current_platform.has_device_capability(90)):
+ print("ERROR: Requires CUDA and >= Hopper (SM90)")
+ print(f"Current platform: {current_platform.device_type}")
+ if current_platform.is_cuda():
+ print(f"Device capability: {current_platform.get_device_capability()}")
+ return 1
+
+ # Read configuration from environment
+ model = os.getenv("VLLM_BENCH_MODEL", "Qwen/Qwen3-1.7B")
+ tp_size = int(os.getenv("VLLM_BENCH_TP_SIZE", "1"))
+ max_batch_size = int(os.getenv("VLLM_BENCH_BATCH_SIZE", "128"))
+ num_trials = int(os.getenv("VLLM_BENCH_NUM_TRIALS", "5"))
+ min_prompt = int(os.getenv("VLLM_BENCH_MIN_PROMPT", "1024"))
+ max_prompt = int(os.getenv("VLLM_BENCH_MAX_PROMPT", "2048"))
+ max_tokens = int(os.getenv("VLLM_BENCH_MAX_TOKENS", "128"))
+ temperature = float(os.getenv("VLLM_BENCH_TEMPERATURE", "0.0"))
+ gpu_mem_util = float(os.getenv("VLLM_BENCH_GPU_MEMORY_UTILIZATION", "0.4"))
+ max_model_len = int(os.getenv("VLLM_BENCH_MAX_MODEL_LEN", "5120"))
+ backend = os.getenv("VLLM_BENCH_BACKEND", "FLASH_ATTN")
+
+ print("\n" + "=" * 80)
+ print("VLLM BATCH INVARIANCE BENCHMARK")
+ print("=" * 80)
+ print("\nConfiguration:")
+ print(f" Model: {model}")
+ print(f" Tensor Parallel Size: {tp_size}")
+ print(f" Attention Backend: {backend}")
+ print(f" Max Batch Size: {max_batch_size}")
+ print(f" Number of Trials: {num_trials}")
+ print(f" Prompt Length Range: {min_prompt}-{max_prompt} words")
+ print(f" Max Tokens to Generate: {max_tokens}")
+ print(f" Temperature: {temperature}")
+ print(f" GPU Memory Utilization: {gpu_mem_util}")
+ print(f" Max Model Length: {max_model_len}")
+ print("=" * 80)
+
+ # Run benchmark WITHOUT batch invariance (baseline)
+ print("\n" + "=" * 80)
+ print("PHASE 1: Running WITHOUT batch invariance (baseline)")
+ print("=" * 80)
+ baseline_results = run_benchmark_with_batch_invariant(
+ model=model,
+ tp_size=tp_size,
+ max_batch_size=max_batch_size,
+ num_trials=num_trials,
+ min_prompt=min_prompt,
+ max_prompt=max_prompt,
+ max_tokens=max_tokens,
+ temperature=temperature,
+ gpu_mem_util=gpu_mem_util,
+ max_model_len=max_model_len,
+ backend=backend,
+ batch_invariant=False,
+ )
+
+ # Run benchmark WITH batch invariance
+ print("\n" + "=" * 80)
+ print("PHASE 2: Running WITH batch invariance")
+ print("=" * 80)
+ batch_inv_results = run_benchmark_with_batch_invariant(
+ model=model,
+ tp_size=tp_size,
+ max_batch_size=max_batch_size,
+ num_trials=num_trials,
+ min_prompt=min_prompt,
+ max_prompt=max_prompt,
+ max_tokens=max_tokens,
+ temperature=temperature,
+ gpu_mem_util=gpu_mem_util,
+ max_model_len=max_model_len,
+ backend=backend,
+ batch_invariant=True,
+ )
+
+ # Compare results
+ print("\n" + "=" * 80)
+ print("COMPARISON: Batch Invariance vs Baseline")
+ print("=" * 80)
+
+ init_overhead_pct = (
+ (batch_inv_results["init_time"] - baseline_results["init_time"])
+ / baseline_results["init_time"]
+ * 100
+ )
+ time_overhead_pct = (
+ (batch_inv_results["avg_time"] - baseline_results["avg_time"])
+ / baseline_results["avg_time"]
+ * 100
+ )
+ throughput_change_pct = (
+ (batch_inv_results["throughput"] - baseline_results["throughput"])
+ / baseline_results["throughput"]
+ * 100
+ )
+
+ print("\nInitialization Time:")
+ print(f" Baseline: {baseline_results['init_time']:.2f}s")
+ print(f" Batch Invariant: {batch_inv_results['init_time']:.2f}s")
+ print(f" Overhead: {init_overhead_pct:+.2f}%")
+
+ print("\nAverage Trial Time:")
+ print(f" Baseline: {baseline_results['avg_time']:.2f}s")
+ print(f" Batch Invariant: {batch_inv_results['avg_time']:.2f}s")
+ print(f" Overhead: {time_overhead_pct:+.2f}%")
+
+ print("\nThroughput (tokens/s):")
+ print(f" Baseline: {baseline_results['throughput']:.2f}")
+ print(f" Batch Invariant: {batch_inv_results['throughput']:.2f}")
+ print(f" Change: {throughput_change_pct:+.2f}%")
+
+ print("\nPrompts/s:")
+ print(f" Baseline: {baseline_results['prompts_per_sec']:.2f}")
+ print(f" Batch Invariant: {batch_inv_results['prompts_per_sec']:.2f}")
+
+ print("\n" + "=" * 80)
+ print("SUMMARY")
+ print("=" * 80)
+ if time_overhead_pct > 0:
+ print(
+ f"Batch invariance mode adds approximately {time_overhead_pct:.1f}% "
+ "overhead"
+ )
+ else:
+ print(
+ f"Batch invariance mode is approximately {-time_overhead_pct:.1f}% "
+ "faster (unexpected!)"
+ )
+
+ if abs(throughput_change_pct) < 1.0:
+ print("Throughput difference is negligible (< 1%)")
+ elif throughput_change_pct < 0:
+ print(
+ f"Throughput decreased by {-throughput_change_pct:.1f}% "
+ "with batch invariance"
+ )
+ else:
+ print(
+ f"Throughput increased by {throughput_change_pct:.1f}% "
+ "with batch invariance (unexpected!)"
+ )
+
+ print("=" * 80 + "\n")
+
+ return 0
+
+
+if __name__ == "__main__":
+ exit(main())
diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py
index 146c268a6b7f..28fc383a318d 100644
--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@@ -69,7 +69,7 @@ def sample_tokens(tokenizer: PreTrainedTokenizerBase, length: int) -> list[int]:
# Remove the special tokens.
return random.choices(
- [v for k, v in vocab.items() if k not in all_special_ids],
+ [v for v in vocab.values() if v not in all_special_ids],
k=length,
)
diff --git a/benchmarks/kernels/bench_block_fp8_gemm.py b/benchmarks/kernels/bench_block_fp8_gemm.py
index f1e504499eaf..11e3ac7f0c1f 100644
--- a/benchmarks/kernels/bench_block_fp8_gemm.py
+++ b/benchmarks/kernels/bench_block_fp8_gemm.py
@@ -1,10 +1,18 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import os
+
+# Disable DeepGEMM for this benchmark to use CUTLASS
+os.environ["VLLM_USE_DEEP_GEMM"] = "0"
+
import torch
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
- apply_w8a8_block_fp8_linear,
+ W8A8BlockFp8LinearOp,
+)
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+ GroupShape,
)
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
CUTLASS_BLOCK_FP8_SUPPORTED,
@@ -39,13 +47,14 @@ def build_w8a8_block_fp8_runner(M, N, K, block_size, device, use_cutlass):
fp8_info = torch.finfo(torch.float8_e4m3fn)
fp8_max, fp8_min = fp8_info.max, fp8_info.min
- # Create random FP8 tensors
+ # Create random input tensor (bfloat16, will be quantized by W8A8BlockFp8LinearOp)
A_ref = (torch.rand(M, K, dtype=torch.bfloat16, device=device) - 0.5) * 2 * fp8_max
+ # Create quantized weight tensor
B_ref = (torch.rand(N, K, dtype=torch.bfloat16, device=device) - 0.5) * 2 * fp8_max
B = B_ref.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
- # Create scales
+ # Create weight scales
block_n, block_k = block_size[0], block_size[1]
n_tiles = (N + block_n - 1) // block_n
k_tiles = (K + block_k - 1) // block_k
@@ -55,19 +64,25 @@ def build_w8a8_block_fp8_runner(M, N, K, block_size, device, use_cutlass):
* factor_for_scale
)
- # SM90 CUTLASS requires row-major format for scales
- if use_cutlass and current_platform.is_device_capability(90):
- Bs = Bs.T.contiguous()
+ # Create W8A8BlockFp8LinearOp instance
+ weight_group_shape = GroupShape(block_n, block_k)
+ act_quant_group_shape = GroupShape(1, block_k) # Per-token, per-group quantization
+
+ linear_op = W8A8BlockFp8LinearOp(
+ weight_group_shape=weight_group_shape,
+ act_quant_group_shape=act_quant_group_shape,
+ cutlass_block_fp8_supported=use_cutlass,
+ use_aiter_and_is_supported=False,
+ )
def run():
- if use_cutlass:
- return apply_w8a8_block_fp8_linear(
- A_ref, B, block_size, Bs, cutlass_block_fp8_supported=True
- )
- else:
- return apply_w8a8_block_fp8_linear(
- A_ref, B, block_size, Bs, cutlass_block_fp8_supported=False
- )
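+        # apply() quantizes the bfloat16 input on the fly and runs the block-FP8 GEMM
+        # (CUTLASS when enabled via cutlass_block_fp8_supported, Triton kernel otherwise).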
+ return linear_op.apply(
+ input=A_ref,
+ weight=B,
+ weight_scale=Bs,
+ input_scale=None,
+ bias=None,
+ )
return run
diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py
index 29ef6409bb16..074b7a440b61 100644
--- a/benchmarks/kernels/benchmark_rope.py
+++ b/benchmarks/kernels/benchmark_rope.py
@@ -1,97 +1,76 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from itertools import accumulate
+import itertools
-import nvtx
import torch
-from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding, get_rope
-from vllm.platforms import current_platform
+from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.triton_utils import triton
from vllm.utils.argparse_utils import FlexibleArgumentParser
+batch_size_range = [2**i for i in range(0, 8, 2)]
+seq_len_range = [2**i for i in range(6, 10, 1)]
+num_heads_range = [32, 48]
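+# Sweep the Cartesian product of (batch_size, seq_len, num_heads) configurations.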
+configs = list(itertools.product(batch_size_range, seq_len_range, num_heads_range))
-def benchmark_rope_kernels_multi_lora(
- is_neox_style: bool,
- batch_size: int,
- seq_len: int,
- num_heads: int,
- head_size: int,
- rotary_dim: int | None,
- dtype: torch.dtype,
- seed: int,
- device: str,
- max_position: int = 8192,
- base: float = 10000,
-) -> None:
- current_platform.seed_everything(seed)
- torch.set_default_device(device)
- if rotary_dim is None:
- rotary_dim = head_size
- # silulating serving 4 LoRAs
- scaling_factors = [1, 2, 4, 8]
- # batched RoPE can take multiple scaling factors
- batched_rope = get_rope(
- head_size,
- rotary_dim,
- max_position,
- base,
- is_neox_style,
- {"rope_type": "linear", "factor": tuple(scaling_factors)},
+
+def get_benchmark(head_size, rotary_dim, is_neox_style, device):
+ @triton.testing.perf_report(
+ triton.testing.Benchmark(
+ x_names=["batch_size", "seq_len", "num_heads"],
+ x_vals=[list(_) for _ in configs],
+ line_arg="provider",
+ line_vals=["torch", "flashinfer", "vllm"],
+ line_names=["PyTorch", "FlashInfer", "vLLM"],
+ styles=[("blue", "-"), ("green", "-"), ("red", "-")],
+ ylabel="us",
+ plot_name=f"rope-perf{'-neox-style' if is_neox_style else ''}",
+ args={},
+ )
)
- # non-batched RoPE takes only one scaling factor, we create multiple
- # instances to simulate the same behavior
- non_batched_ropes: list[RotaryEmbedding] = []
- for scaling_factor in scaling_factors:
- non_batched_ropes.append(
- get_rope(
- head_size,
- rotary_dim,
- max_position,
- base,
- is_neox_style,
- {"rope_type": "linear", "factor": (scaling_factor,)},
- )
+ def benchmark(batch_size, seq_len, num_heads, provider):
+ dtype = torch.bfloat16
+ max_position = 8192
+ base = 10000
+ rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style)
+ rope = rope.to(dtype=dtype, device=device)
+ cos_sin_cache = rope.cos_sin_cache.to(dtype=torch.float, device=device)
+
+ positions = torch.randint(0, max_position, (batch_size, seq_len), device=device)
+ query = torch.randn(
+ (batch_size, seq_len, num_heads * head_size), dtype=dtype, device=device
)
+ key = torch.randn_like(query)
- positions = torch.randint(0, max_position, (batch_size, seq_len))
- query = torch.randn(batch_size, seq_len, num_heads * head_size, dtype=dtype)
- key = torch.randn_like(query)
+ quantiles = [0.5, 0.2, 0.8]
- # create query offsets for batched RoPE, we concat multiple kv cache
- # together and each query needs to find the right kv cache of its type
- offset_map = torch.tensor(
- list(
- accumulate(
- [0]
- + [
- max_position * scaling_factor * 2
- for scaling_factor in scaling_factors[:-1]
- ]
+ if provider == "torch":
+ ms, min_ms, max_ms = triton.testing.do_bench(
+ lambda: rope.forward_native(positions, query.clone(), key.clone()),
+ quantiles=quantiles,
)
- )
- )
- query_types = torch.randint(
- 0, len(scaling_factors), (batch_size, seq_len), device=device
- )
- # map query types to offsets
- query_offsets = offset_map[query_types]
- # the kernel takes flattened offsets
- flatten_offsets = query_offsets.flatten()
+ elif provider == "flashinfer":
+ ms, min_ms, max_ms = triton.testing.do_bench(
+ lambda: torch.ops.vllm.flashinfer_rotary_embedding(
+ positions,
+ query.clone(),
+ key.clone(),
+ head_size,
+ cos_sin_cache,
+ is_neox_style,
+ ),
+ quantiles=quantiles,
+ )
+ else:
+ ms, min_ms, max_ms = triton.testing.do_bench(
+ lambda: rope.forward_cuda(positions, query.clone(), key.clone()),
+ quantiles=quantiles,
+ )
+
+ return 1000 * ms, 1000 * max_ms, 1000 * min_ms
- # batched queries of the same type together for non-batched RoPE
- queries = [query[query_types == i] for i in range(len(scaling_factors))]
- keys = [key[query_types == i] for i in range(len(scaling_factors))]
- packed_qkr = zip(queries, keys, non_batched_ropes)
- # synchronize before start timing
- torch.cuda.synchronize()
- with nvtx.annotate("non-batched", color="yellow"):
- for q, k, r in packed_qkr:
- r.forward(positions, q, k)
- torch.cuda.synchronize()
- with nvtx.annotate("batched", color="green"):
- batched_rope.forward(positions, query, key, flatten_offsets)
- torch.cuda.synchronize()
+ return benchmark
if __name__ == "__main__":
@@ -116,17 +95,12 @@ def benchmark_rope_kernels_multi_lora(
parser.add_argument(
"--device", type=str, choices=["cuda:0", "cuda:1"], default="cuda:0"
)
+ parser.add_argument("--save-path", type=str, default="./configs/rope/")
args = parser.parse_args()
- print(args)
- benchmark_rope_kernels_multi_lora(
- is_neox_style=args.is_neox_style,
- batch_size=args.batch_size,
- seq_len=args.seq_len,
- num_heads=args.num_heads,
- head_size=args.head_size,
- rotary_dim=args.rotary_dim,
- dtype=getattr(torch, args.dtype),
- seed=args.seed,
- device=args.device,
+ # Get the benchmark function
+ benchmark = get_benchmark(
+ args.head_size, args.rotary_dim, args.is_neox_style, args.device
)
+ # Run performance benchmark
+ benchmark.run(print_data=True, save_path=args.save_path)
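The rewritten benchmark above leans on `triton.testing.do_bench`, which returns times in milliseconds at the requested quantiles; the inner `benchmark` function multiplies by 1000 so `perf_report` plots microseconds. A minimal sketch of that timing path, using a hypothetical stand-in op instead of the RoPE kernels (assumes torch and triton are installed and a CUDA device is available):

```python
import torch
import triton


def _toy_op(x: torch.Tensor) -> torch.Tensor:
    # Hypothetical stand-in for rope.forward_native / rope.forward_cuda above.
    return torch.nn.functional.silu(x)


x = torch.randn(4096, 4096, device="cuda")
quantiles = [0.5, 0.2, 0.8]  # median, p20, p80 of the measured runtimes
ms, p20_ms, p80_ms = triton.testing.do_bench(lambda: _toy_op(x), quantiles=quantiles)
print(f"median={ms * 1000:.1f}us  p20={p20_ms * 1000:.1f}us  p80={p80_ms * 1000:.1f}us")
```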
diff --git a/benchmarks/multi_turn/README.md b/benchmarks/multi_turn/README.md
index f5b5c6c97d48..b0be1e3a69a6 100644
--- a/benchmarks/multi_turn/README.md
+++ b/benchmarks/multi_turn/README.md
@@ -55,6 +55,10 @@ output_num_chunks 166.0 99.01 11.80 79.00 90.00 98.00 108.75
----------------------------------------------------------------------------------------------------
```
+If you run with `--warmup-step`, the summary will also include `warmup_runtime_sec`
+and `total_runtime_incl_warmup_sec` (while `runtime_sec` continues to reflect the
+benchmark-only runtime so the reported throughput stays comparable).
+
### JSON configuration file for synthetic conversations generation
The input flag `--input-file` is used to determine the input conversations for the benchmark.
diff --git a/benchmarks/multi_turn/bench_dataset.py b/benchmarks/multi_turn/bench_dataset.py
index 2674899d1cc5..8cb8a2f386a9 100644
--- a/benchmarks/multi_turn/bench_dataset.py
+++ b/benchmarks/multi_turn/bench_dataset.py
@@ -11,6 +11,7 @@
Color,
logger,
)
+from tqdm import tqdm
from transformers import AutoTokenizer # type: ignore
# Conversation ID is a string (e.g: "UzTK34D")
@@ -417,6 +418,10 @@ def generate_conversations(
data = file.read()
tokens_in_file = tokenizer.encode(data, add_special_tokens=False)
list_of_tokens.extend(tokens_in_file)
+ logger.info(
+ f"Loaded {len(tokens_in_file)} tokens from file {filename}, "
+ f"total tokens so far: {len(list_of_tokens)}"
+ )
conversations: ConversationsMap = {}
conv_id = 0
@@ -449,18 +454,25 @@ def generate_conversations(
)
base_offset += common_prefix_tokens
- for conv_id in range(args.num_conversations):
+ for conv_id in tqdm(
+ range(args.num_conversations),
+ total=args.num_conversations,
+ desc="Generating conversations",
+ unit="conv",
+ ):
# Generate a single conversation
messages: MessagesList = []
nturns = turn_count[conv_id]
# User prompt token count per turn (with lower limit)
- input_token_count: np.ndarray = args.input_num_tokens.sample(nturns)
+ input_token_count: np.ndarray = args.input_num_tokens.sample(nturns).astype(int)
input_token_count = np.maximum(input_token_count, base_prompt_token_count)
# Assistant answer token count per turn (with lower limit)
- output_token_count: np.ndarray = args.output_num_tokens.sample(nturns)
+ output_token_count: np.ndarray = args.output_num_tokens.sample(nturns).astype(
+ int
+ )
output_token_count = np.maximum(output_token_count, 1)
user_turn = True
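The `.astype(int)` casts above matter because the sampled per-turn token counts can come back as floats; they are then clamped to sane lower bounds. A small NumPy sketch of that step (the lognormal draws here are hypothetical stand-ins for `args.input_num_tokens` / `args.output_num_tokens`):

```python
import numpy as np

rng = np.random.default_rng(0)
nturns = 4
base_prompt_token_count = 16  # hypothetical lower bound for a user prompt

# Stand-in for args.input_num_tokens.sample(nturns): distributions may return floats,
# hence the .astype(int) before clamping to the minimum prompt size.
input_token_count = rng.lognormal(mean=5.0, sigma=0.5, size=nturns).astype(int)
input_token_count = np.maximum(input_token_count, base_prompt_token_count)

# Stand-in for args.output_num_tokens.sample(nturns), clamped to at least 1 token.
output_token_count = rng.lognormal(mean=4.0, sigma=0.5, size=nturns).astype(int)
output_token_count = np.maximum(output_token_count, 1)

print(input_token_count, output_token_count)
```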
diff --git a/benchmarks/multi_turn/benchmark_serving_multi_turn.py b/benchmarks/multi_turn/benchmark_serving_multi_turn.py
index 5d2ac66e5ab9..e23f6b923f1b 100644
--- a/benchmarks/multi_turn/benchmark_serving_multi_turn.py
+++ b/benchmarks/multi_turn/benchmark_serving_multi_turn.py
@@ -55,6 +55,7 @@ class ClientArgs(NamedTuple):
verify_output: bool
conversation_sampling: ConversationSampling
request_rate: float
+ max_retries: int
class RequestArgs(NamedTuple):
@@ -63,6 +64,7 @@ class RequestArgs(NamedTuple):
stream: bool
limit_min_tokens: int # Use negative value for no limit
limit_max_tokens: int # Use negative value for no limit
+ timeout_sec: int
class BenchmarkArgs(NamedTuple):
@@ -214,6 +216,7 @@ async def send_request(
stream: bool = True,
min_tokens: int | None = None,
max_tokens: int | None = None,
+ timeout_sec: int = 120,
) -> ServerResponse:
payload = {
"model": model,
@@ -235,10 +238,16 @@ async def send_request(
headers = {"Content-Type": "application/json"}
# Calculate the timeout for the request
- timeout_sec = 120
if max_tokens is not None:
# Assume TPOT of 200ms and use max_tokens to determine timeout
- timeout_sec = max(timeout_sec, int(max_tokens * 0.2))
+ token_based_timeout = int(max_tokens * 0.2)
+ if token_based_timeout > timeout_sec:
+ timeout_sec = token_based_timeout
+ logger.info(
+ "Using timeout of %ds based on max_tokens %d",
+ timeout_sec,
+ max_tokens,
+ )
timeout = aiohttp.ClientTimeout(total=timeout_sec)
valid_response = True
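The effective per-request timeout is the configured `--request-timeout-sec` unless `max_tokens` implies a longer decode at the assumed 200 ms per output token. A minimal sketch of that rule (the helper name is illustrative, not part of the script):

```python
def effective_timeout_sec(base_timeout_sec: int, max_tokens: int | None) -> int:
    # Assume ~200 ms per output token (TPOT) when max_tokens is known.
    if max_tokens is not None:
        return max(base_timeout_sec, int(max_tokens * 0.2))
    return base_timeout_sec


# The 120 s default is kept for short decodes but grows with max_tokens.
assert effective_timeout_sec(120, 500) == 120
assert effective_timeout_sec(120, 2000) == 400
```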
@@ -409,6 +418,7 @@ async def send_turn(
req_args.stream,
min_tokens,
max_tokens,
+ req_args.timeout_sec,
)
if response.valid is False:
@@ -518,6 +528,25 @@ async def poisson_sleep(request_rate: float, verbose: bool = False) -> None:
await asyncio.sleep(interval)
+async def exponential_backoff_sleep(
+ attempt_cnt: int,
+ base_rate: float = 1.0,
+ backoff_factor: float = 2.0,
+ jitter_fraction: float = 0.10,
+ verbose: bool = False,
+) -> None:
+ # Sleep with exponential backoff and jitter after a failed request.
+ backoff_delay = base_rate * (backoff_factor**attempt_cnt)
+ jittered_delay = backoff_delay * (
+ 1 + np.random.uniform(-jitter_fraction, jitter_fraction)
+ )
+
+ if verbose:
+ logger.info(f"Backoff for {jittered_delay:.3f} seconds...")
+
+ await asyncio.sleep(jittered_delay)
+
+
async def client_main(
args: ClientArgs,
req_args: RequestArgs,
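With the defaults above (`base_rate=1.0`, `backoff_factor=2.0`, ±10% jitter), the sleep between retries grows roughly as 1 s, 2 s, 4 s, 8 s. A synchronous sketch of the same delay schedule, for illustration only:

```python
import numpy as np


def backoff_delay(
    attempt_cnt: int,
    base_rate: float = 1.0,
    backoff_factor: float = 2.0,
    jitter_fraction: float = 0.10,
) -> float:
    # Exponential backoff with multiplicative jitter, mirroring the async helper above.
    delay = base_rate * (backoff_factor**attempt_cnt)
    return delay * (1 + np.random.uniform(-jitter_fraction, jitter_fraction))


np.random.seed(0)
print([round(backoff_delay(a), 2) for a in range(4)])  # ~[1, 2, 4, 8], each +/- 10%
```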
@@ -532,8 +561,11 @@ async def client_main(
f"{Color.CYAN}Started client {client_id}: max_num_requests={args.max_num_requests}, max_active_conversations={args.max_active_conversations}{Color.RESET}" # noqa: E501
)
- random.seed(args.seed)
- np.random.seed(args.seed)
+ # Set unique seed per client (each client runs in its own process)
+ # Add 1 to ensure no client uses the same seed as the main process
+ client_seed = args.seed + client_id + 1
+ random.seed(client_seed)
+ np.random.seed(client_seed)
# Active conversations
active_convs: ConversationsMap = {}
@@ -646,49 +678,62 @@ async def client_main(
)
time_of_last_turn[conv_id] = curr_time_sec
- success = True
- try:
- result = await send_turn(
- session,
- client_id,
- conv_id,
- messages,
- current_turn,
- tokenizer,
- req_args,
- args.print_content,
- args.verify_output,
- )
- if result is not None:
- result_queue.put(result)
- else:
- # None means that the request failed,
- # and should not be added to the statistics.
- success = False
- num_failures += 1
-
- logger.warning(
- f"{Color.YELLOW}Client {client_id} - Request rejected during conversation ID {conv_id} (turn: {current_turn}){Color.RESET}" # noqa: E501
+ success = False
+ for attempt_cnt in range(args.max_retries + 1):
+ try:
+ exception = False
+ result = await send_turn(
+ session,
+ client_id,
+ conv_id,
+ messages,
+ current_turn,
+ tokenizer,
+ req_args,
+ args.print_content,
+ args.verify_output,
+ )
+ if result is not None:
+ result_queue.put(result)
+ success = True
+ break
+ else:
+ logger.warning(
+ f"{Color.YELLOW}Client {client_id} - Request rejected during conversation ID {conv_id} (turn: {current_turn}){Color.RESET}" # noqa: E501
+ )
+ except asyncio.exceptions.TimeoutError:
+ exception = True
+ logger.error(
+ "%sClient %d - Timeout during conversation ID %s (turn: %d). "
+ "Base timeout is %ss (set with --request-timeout-sec), but the "
+ "effective timeout may be longer based on max_tokens. If this "
+ "is unexpected, consider increasing the timeout or checking "
+ "model performance.%s",
+ Color.RED,
+ client_id,
+ conv_id,
+ current_turn,
+ req_args.timeout_sec,
+ Color.RESET,
+ )
+ except Exception:
+ exception = True
+ logger.exception(
+ f"{Color.RED}Client {client_id} - Exception during conversation ID {conv_id} (turn: {current_turn}){Color.RESET}" # noqa: E501
)
- # Remove the conversation (should not be used again)
- active_convs.pop(conv_id)
+ # Sleep before retry if not last attempt
+ if not success and attempt_cnt < args.max_retries:
+ await exponential_backoff_sleep(attempt_cnt, verbose=args.verbose)
- except asyncio.exceptions.TimeoutError:
+ if not success:
num_failures += 1
- logger.exception(
- f"{Color.RED}Client {client_id} - Timeout during conversation ID {conv_id} (turn: {current_turn}){Color.RESET}" # noqa: E501
- )
- break # Exit gracefully instead of raising an error
+ # Remove the conversation (should not be used again)
+ active_convs.pop(conv_id)
+ if exception:
+ break # Exit gracefully instead of raising an error
- except Exception:
- num_failures += 1
- logger.exception(
- f"{Color.RED}Client {client_id} - Exception during conversation ID {conv_id} (turn: {current_turn}){Color.RESET}" # noqa: E501
- )
- break # Exit gracefully instead of raising an error
-
- if success:
+ else:
num_successes += 1
# Update the turns counter to include the LLM response
@@ -803,6 +848,7 @@ def get_client_config(
verify_output=args.verify_output,
conversation_sampling=args.conversation_sampling,
request_rate=args.request_rate,
+ max_retries=args.max_retries,
)
if args.limit_min_tokens > 0 or args.limit_max_tokens > 0:
@@ -815,6 +861,9 @@ def get_client_config(
"Invalid min/max tokens limits (min should not be larger than max)"
)
+ if args.request_timeout_sec <= 0:
+ raise ValueError("Request timeout must be a positive number")
+
# Arguments for API requests
chat_url = f"{args.url}/v1/chat/completions"
model_name = args.served_model_name if args.served_model_name else args.model
@@ -825,6 +874,7 @@ def get_client_config(
stream=not args.no_stream,
limit_min_tokens=args.limit_min_tokens,
limit_max_tokens=args.limit_max_tokens,
+ timeout_sec=args.request_timeout_sec,
)
return client_args, req_args
@@ -968,7 +1018,7 @@ async def main_mp(
f"(is alive: {client.is_alive()}){Color.RESET}"
)
- client.join(timeout=120)
+ client.join(timeout=req_args.timeout_sec + 1)
if client.is_alive():
logger.warning(
@@ -1026,6 +1076,7 @@ def process_statistics(
verbose: bool,
gen_conv_args: GenConvArgs | None = None,
excel_output: bool = False,
+ warmup_runtime_sec: float | None = None,
) -> None:
if len(client_metrics) == 0:
logger.info("No samples to process")
@@ -1119,8 +1170,13 @@ def process_statistics(
# Convert milliseconds to seconds
runtime_sec = runtime_sec / 1000.0
requests_per_sec = float(len(df)) / runtime_sec
-
- params = {"runtime_sec": runtime_sec, "requests_per_sec": requests_per_sec}
+ params = {
+ "runtime_sec": runtime_sec,
+ "requests_per_sec": requests_per_sec,
+ }
+ if warmup_runtime_sec is not None:
+ params["warmup_runtime_sec"] = warmup_runtime_sec
+ params["total_runtime_incl_warmup_sec"] = runtime_sec + warmup_runtime_sec
# Generate a summary of relevant metrics (and drop irrelevant data)
df = df.drop(columns=exclude).describe(percentiles=percentiles).transpose()
@@ -1334,6 +1390,16 @@ async def main() -> None:
help="Expected request rate (Poisson process) per client in requests/sec."
"Set to 0 for no delay between requests.",
)
+ parser.add_argument(
+ "--max-retries",
+ type=int,
+ default=int(os.environ.get("MULTITURN_BENCH_MAX_RETRIES", "0")),
+ help="Maximum number of retry attempts for timed-out requests. "
+ "Default is 0 (no retries). "
+ "Set to higher values to retry failed requests and maintain "
+ "fair workload distribution. "
+ "Can also be set via MULTITURN_BENCH_MAX_RETRIES environment variable.",
+ )
parser.add_argument(
"--conversation-sampling",
type=ConversationSampling,
@@ -1351,6 +1417,13 @@ async def main() -> None:
action="store_true",
help="Verify the LLM output (compare to the answers in the input JSON file)",
)
+ parser.add_argument(
+ "--request-timeout-sec",
+ type=int,
+ default=120,
+ help="Timeout in seconds for each API request (default: 120). "
+ "Automatically increased if max tokens imply longer decoding.",
+ )
parser.add_argument(
"--no-stream",
@@ -1426,6 +1499,7 @@ async def main() -> None:
f"Invalid --warmup-percentage={args.warmup_percentage}"
) from None
+ # Set global seeds for main process
random.seed(args.seed)
np.random.seed(args.seed)
@@ -1484,6 +1558,8 @@ async def main() -> None:
url=args.url, num_clients=args.num_clients, early_stop=not args.no_early_stop
)
+ warmup_runtime_sec: float | None = None
+
# Warm-up step
if args.warmup_step:
# Only send a single user prompt from every conversation.
@@ -1498,26 +1574,56 @@ async def main() -> None:
# all clients should finish their work before exiting
warmup_bench_args = bench_args._replace(early_stop=False)
- logger.info(f"{Color.PURPLE}Warmup start{Color.RESET}")
+ logger.info("%sWarmup start%s", Color.PURPLE, Color.RESET)
+ warmup_start_ns = time.perf_counter_ns()
conversations, _ = await main_mp(
warmup_client_args, req_args, warmup_bench_args, tokenizer, conversations
)
- logger.info(f"{Color.PURPLE}Warmup done{Color.RESET}")
+ warmup_runtime_sec = nanosec_to_sec(time.perf_counter_ns() - warmup_start_ns)
+ logger.info(
+ "%sWarmup runtime: %.3f sec (%.3f ms)%s",
+ Color.PURPLE,
+ warmup_runtime_sec,
+ warmup_runtime_sec * 1000,
+ Color.RESET,
+ )
+ logger.info("%sWarmup done%s", Color.PURPLE, Color.RESET)
# Run the benchmark
- start_time = time.perf_counter_ns()
+ benchmark_start_ns = time.perf_counter_ns()
client_convs, client_metrics = await main_mp(
client_args, req_args, bench_args, tokenizer, conversations
)
- total_runtime_ms = nanosec_to_millisec(time.perf_counter_ns() - start_time)
+ benchmark_runtime_sec = nanosec_to_sec(time.perf_counter_ns() - benchmark_start_ns)
# Calculate requests per second
- total_runtime_sec = total_runtime_ms / 1000.0
- rps = len(client_metrics) / total_runtime_sec
+ requests_per_sec = len(client_metrics) / benchmark_runtime_sec
+ benchmark_runtime_ms = benchmark_runtime_sec * 1000.0
logger.info(
- f"{Color.GREEN}All clients finished, total runtime: {total_runtime_sec:.3f} sec"
- f" ({total_runtime_ms:.3f} ms), requests per second: {rps:.3f}{Color.RESET}"
+ "%sAll clients finished, benchmark runtime: %.3f sec (%.3f ms), "
+ "requests per second: %.3f%s",
+ Color.GREEN,
+ benchmark_runtime_sec,
+ benchmark_runtime_ms,
+ requests_per_sec,
+ Color.RESET,
)
+ if warmup_runtime_sec is not None:
+ total_runtime_sec = benchmark_runtime_sec + warmup_runtime_sec
+ logger.info(
+ "%sWarmup runtime: %.3f sec (%.3f ms)%s",
+ Color.GREEN,
+ warmup_runtime_sec,
+ warmup_runtime_sec * 1000,
+ Color.RESET,
+ )
+ logger.info(
+ "%sTotal runtime (including warmup): %.3f sec (%.3f ms)%s",
+ Color.GREEN,
+ total_runtime_sec,
+ total_runtime_sec * 1000,
+ Color.RESET,
+ )
# Benchmark parameters
params = {
@@ -1542,6 +1648,7 @@ async def main() -> None:
verbose=args.verbose,
gen_conv_args=gen_conv_args,
excel_output=args.excel_output,
+ warmup_runtime_sec=warmup_runtime_sec,
)
if args.output_file is not None:
diff --git a/benchmarks/multi_turn/requirements.txt b/benchmarks/multi_turn/requirements.txt
index f0e1935914a1..bae656a5c5c4 100644
--- a/benchmarks/multi_turn/requirements.txt
+++ b/benchmarks/multi_turn/requirements.txt
@@ -2,4 +2,5 @@ numpy>=1.24
pandas>=2.0.0
aiohttp>=3.10
transformers>=4.46
-xlsxwriter>=3.2.1
\ No newline at end of file
+xlsxwriter>=3.2.1
+tqdm>=4.66
diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake
index dbda19fbcbf2..aa84125818d1 100644
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -15,6 +15,7 @@ endif()
#
set(ENABLE_AVX512BF16 $ENV{VLLM_CPU_AVX512BF16})
set(ENABLE_AVX512VNNI $ENV{VLLM_CPU_AVX512VNNI})
+set(ENABLE_AMXBF16 $ENV{VLLM_CPU_AMXBF16})
include_directories("${CMAKE_SOURCE_DIR}/csrc")
@@ -140,6 +141,22 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED)
set(ENABLE_AVX512VNNI OFF)
message(WARNING "Disable AVX512-VNNI ISA support, no avx512_vnni found in local CPU flags." " If cross-compilation is required, please set env VLLM_CPU_AVX512VNNI=1.")
endif()
+
+ find_isa(${CPUINFO} "amx_bf16" AMXBF16_FOUND)
+ if (AMXBF16_FOUND OR ENABLE_AMXBF16)
+ if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND
+ CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.3)
+ list(APPEND CXX_COMPILE_FLAGS "-mamx-bf16" "-mamx-tile")
+ set(ENABLE_AMXBF16 ON)
+ add_compile_definitions(CPU_CAPABILITY_AMXBF16)
+ else()
+ set(ENABLE_AMXBF16 OFF)
+ message(WARNING "Disable AMX_BF16 ISA support, requires gcc/g++ >= 12.3")
+ endif()
+ else()
+ set(ENABLE_AMXBF16 OFF)
+ message(WARNING "Disable AMX_BF16 ISA support, no amx_bf16 found in local CPU flags." " If cross-compilation is required, please set env VLLM_CPU_AMXBF16=1.")
+ endif()
elseif (AVX2_FOUND)
list(APPEND CXX_COMPILE_FLAGS "-mavx2")
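`find_isa` works by grepping the CPU flags reported by the system (e.g. the `flags` line of `/proc/cpuinfo` on x86 Linux), and `VLLM_CPU_AMXBF16=1` force-enables the path when cross-compiling. A rough Python equivalent of that detection, purely illustrative and Linux-only:

```python
def cpu_flag_present(flag: str, cpuinfo_path: str = "/proc/cpuinfo") -> bool:
    # Look for the flag token on the CPU "flags" line, similar to CMake's find_isa().
    try:
        with open(cpuinfo_path) as f:
            for line in f:
                if line.startswith("flags"):
                    return flag in line.split()
    except OSError:
        pass
    return False


print("amx_bf16 supported:", cpu_flag_present("amx_bf16"))
```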
@@ -193,7 +210,30 @@ endif()
if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
# Fetch and build Arm Compute Library (ACL) as oneDNN's backend for AArch64
# TODO [fadara01]: remove this once ACL can be fetched and built automatically as a dependency of oneDNN
+ set(ONEDNN_AARCH64_USE_ACL OFF CACHE BOOL "")
if(ASIMD_FOUND)
+ # Set number of parallel build processes
+ include(ProcessorCount)
+ ProcessorCount(NPROC)
+ if(NOT NPROC)
+ set(NPROC 4)
+ endif()
+ # locate PyTorch's libgomp (e.g. site-packages/torch.libs/libgomp-947d5fa1.so.1.0.0)
+ # and create a local shim dir with it
+ vllm_prepare_torch_gomp_shim(VLLM_TORCH_GOMP_SHIM_DIR)
+
+ find_library(OPEN_MP
+ NAMES gomp
+ PATHS ${VLLM_TORCH_GOMP_SHIM_DIR}
+ NO_DEFAULT_PATH
+ REQUIRED
+ )
+ # Set LD_LIBRARY_PATH to include the shim dir at build time to use the same libgomp as PyTorch
+ if (OPEN_MP)
+ set(ENV{LD_LIBRARY_PATH} "${VLLM_TORCH_GOMP_SHIM_DIR}:$ENV{LD_LIBRARY_PATH}")
+ endif()
+
+ # Fetch and populate ACL
if(DEFINED ENV{ACL_ROOT_DIR} AND IS_DIRECTORY "$ENV{ACL_ROOT_DIR}")
message(STATUS "Using ACL from specified source directory: $ENV{ACL_ROOT_DIR}")
else()
@@ -202,43 +242,58 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
SUBBUILD_DIR "${FETCHCONTENT_BASE_DIR}/arm_compute-subbuild"
SOURCE_DIR "${FETCHCONTENT_BASE_DIR}/arm_compute-src"
GIT_REPOSITORY https://github.com/ARM-software/ComputeLibrary.git
- GIT_TAG v52.2.0
+ GIT_TAG v52.6.0
GIT_SHALLOW TRUE
GIT_PROGRESS TRUE
)
set(ENV{ACL_ROOT_DIR} "${arm_compute_SOURCE_DIR}")
+ set(ACL_LIB_DIR "$ENV{ACL_ROOT_DIR}/build")
endif()
- # Build ACL with scons
- include(ProcessorCount)
- ProcessorCount(_NPROC)
- set(_scons_cmd
- scons -j${_NPROC}
- Werror=0 debug=0 neon=1 examples=0 embed_kernels=0 os=linux
- arch=armv8.2-a build=native benchmark_examples=0 fixed_format_kernels=1
- multi_isa=1 openmp=1 cppthreads=0
+ # Build ACL with CMake
+ set(ARM_COMPUTE_BUILD_SHARED_LIB "OFF")
+ set(CMAKE_BUILD_TYPE "Release")
+ set(ARM_COMPUTE_ARCH "armv8.2-a")
+ set(ARM_COMPUTE_ENABLE_ASSERTS "OFF")
+ set(ARM_COMPUTE_ENABLE_CPPTHREADS "OFF")
+ set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER")
+ set(ARM_COMPUTE_ENABLE_OPENMP "ON")
+ set(ARM_COMPUTE_ENABLE_WERROR "OFF")
+ set(ARM_COMPUTE_BUILD_EXAMPLES "OFF")
+ set(ARM_COMPUTE_BUILD_TESTING "OFF")
+
+ set(_cmake_config_cmd
+ ${CMAKE_COMMAND} -G Ninja -B build
+ -DARM_COMPUTE_BUILD_SHARED_LIB=OFF
+ -DCMAKE_BUILD_TYPE=Release
+ -DARM_COMPUTE_ARCH=armv8.2-a
+ -DARM_COMPUTE_ENABLE_ASSERTS=OFF
+ -DARM_COMPUTE_ENABLE_CPPTHREADS=OFF
+ -DARM_COMPUTE_ENABLE_OPENMP=ON
+ -DARM_COMPUTE_ENABLE_WERROR=OFF
+ -DARM_COMPUTE_BUILD_EXAMPLES=OFF
+ -DARM_COMPUTE_BUILD_TESTING=OFF)
+ set(_cmake_build_cmd
+ ${CMAKE_COMMAND} --build build -- -j${NPROC}
)
- # locate PyTorch's libgomp (e.g. site-packages/torch.libs/libgomp-947d5fa1.so.1.0.0)
- # and create a local shim dir with it
- include("${CMAKE_CURRENT_LIST_DIR}/utils.cmake")
- vllm_prepare_torch_gomp_shim(VLLM_TORCH_GOMP_SHIM_DIR)
-
- if(NOT VLLM_TORCH_GOMP_SHIM_DIR STREQUAL "")
- list(APPEND _scons_cmd extra_link_flags=-L${VLLM_TORCH_GOMP_SHIM_DIR})
- endif()
-
execute_process(
- COMMAND ${_scons_cmd}
+ COMMAND ${_cmake_config_cmd}
+ WORKING_DIRECTORY "$ENV{ACL_ROOT_DIR}"
+ )
+ execute_process(
+ COMMAND ${_cmake_build_cmd}
WORKING_DIRECTORY "$ENV{ACL_ROOT_DIR}"
RESULT_VARIABLE _acl_rc
)
+
if(NOT _acl_rc EQUAL 0)
-  message(FATAL_ERROR "ACL SCons build failed (exit ${_acl_rc}).")
+  message(FATAL_ERROR "ACL CMake build failed (exit ${_acl_rc}).")
endif()
+ message(STATUS "Arm Compute Library (ACL) built successfully.")
- set(ONEDNN_AARCH64_USE_ACL "ON")
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/")
+ # VLLM/oneDNN settings for ACL
+ set(ONEDNN_AARCH64_USE_ACL ON CACHE BOOL "" FORCE)
add_compile_definitions(VLLM_USE_ACL)
endif()
@@ -255,7 +310,7 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
FetchContent_Declare(
oneDNN
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
- GIT_TAG v3.9
+ GIT_TAG v3.10
GIT_PROGRESS TRUE
GIT_SHALLOW TRUE
)
@@ -275,7 +330,10 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
set(ONEDNN_VERBOSE "OFF")
set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
+ set(VLLM_BUILD_TYPE ${CMAKE_BUILD_TYPE})
+ set(CMAKE_BUILD_TYPE "Release") # remove oneDNN debug symbols to reduce size
FetchContent_MakeAvailable(oneDNN)
+ set(CMAKE_BUILD_TYPE ${VLLM_BUILD_TYPE})
add_library(dnnl_ext OBJECT "csrc/cpu/dnnl_helper.cpp")
target_include_directories(
dnnl_ext
@@ -305,14 +363,14 @@ endif()
#
set(VLLM_EXT_SRC
"csrc/cpu/activation.cpp"
- "csrc/cpu/attention.cpp"
- "csrc/cpu/cache.cpp"
"csrc/cpu/utils.cpp"
"csrc/cpu/layernorm.cpp"
"csrc/cpu/mla_decode.cpp"
"csrc/cpu/pos_encoding.cpp"
- "csrc/cpu/torch_bindings.cpp"
- "csrc/moe/dynamic_4bit_int_moe_cpu.cpp")
+ "csrc/moe/dynamic_4bit_int_moe_cpu.cpp"
+ "csrc/cpu/cpu_attn.cpp"
+ "csrc/cpu/scratchpad_manager.cpp"
+ "csrc/cpu/torch_bindings.cpp")
if (AVX512_FOUND AND NOT AVX512_DISABLED)
set(VLLM_EXT_SRC
diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake
index 931090db50e9..567c8959f045 100644
--- a/cmake/external_projects/vllm_flash_attn.cmake
+++ b/cmake/external_projects/vllm_flash_attn.cmake
@@ -38,7 +38,7 @@ else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
- GIT_TAG a893712401d70362fbb299cd9c4b3476e8e9ed54
+ GIT_TAG 58e0626a692f09241182582659e3bf8f16472659
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
diff --git a/csrc/cpu/attention.cpp b/csrc/cpu/attention.cpp
deleted file mode 100644
index 82862fea7f2b..000000000000
--- a/csrc/cpu/attention.cpp
+++ /dev/null
@@ -1,798 +0,0 @@
-#include "cpu_types.hpp"
-
-namespace {
-
-template <typename scalar_t>
-struct KernelVecType {
- using q_load_vec_type = void;
- using q_vec_type = void;
- using k_load_vec_type = void;
- using k_vec_type = void;
- using qk_acc_vec_type = void;
- using v_load_vec_type = void;
-};
-
-template <>
-struct KernelVecType<float> {
- using q_load_vec_type = vec_op::FP32Vec4;
- using q_vec_type = vec_op::FP32Vec16;
- using k_load_vec_type = vec_op::FP32Vec16;
- using k_vec_type = vec_op::FP32Vec16;
- using qk_acc_vec_type = vec_op::FP32Vec16;
- using v_load_vec_type = vec_op::FP32Vec16;
-};
-
-template <>
-struct KernelVecType<c10::Half> {
-#if defined(__powerpc64__) || defined(__s390x__)
- // Power and s390x architecture-specific vector types
- using q_load_vec_type = vec_op::FP32Vec8;
- using k_load_vec_type = vec_op::FP32Vec16;
- using v_load_vec_type = vec_op::FP32Vec16;
-#else
- // Fallback for other architectures, including x86
- using q_load_vec_type = vec_op::FP16Vec8;
- using k_load_vec_type = vec_op::FP16Vec16;
- using v_load_vec_type = vec_op::FP16Vec16;
-#endif
- using q_vec_type = vec_op::FP32Vec16;
- using k_vec_type = vec_op::FP32Vec16;
- using qk_acc_vec_type = vec_op::FP32Vec16;
-};
-
-#ifdef __AVX512BF16__
-template <>
-struct KernelVecType<c10::BFloat16> {
- using q_load_vec_type = vec_op::BF16Vec8;
- using q_vec_type = vec_op::BF16Vec32;
- using k_load_vec_type = vec_op::BF16Vec32;
- using k_vec_type = vec_op::BF16Vec32;
- using qk_acc_vec_type = vec_op::FP32Vec16;
- using v_load_vec_type = vec_op::BF16Vec16;
-};
-#else
- #ifdef __aarch64__
- #ifndef ARM_BF16_SUPPORT
- // pass
- #else
-template <>
-struct KernelVecType<c10::BFloat16> {
- using q_load_vec_type = vec_op::BF16Vec8;
- using q_vec_type = vec_op::FP32Vec16;
- using k_load_vec_type = vec_op::BF16Vec16;
- using k_vec_type = vec_op::FP32Vec16;
- using qk_acc_vec_type = vec_op::FP32Vec16;
- using v_load_vec_type = vec_op::BF16Vec16;
-};
- #endif
- #else
-template <>
-struct KernelVecType<c10::BFloat16> {
- using q_load_vec_type = vec_op::BF16Vec8;
- using q_vec_type = vec_op::FP32Vec16;
- using k_load_vec_type = vec_op::BF16Vec16;
- using k_vec_type = vec_op::FP32Vec16;
- using qk_acc_vec_type = vec_op::FP32Vec16;
- using v_load_vec_type = vec_op::BF16Vec16;
-};
- #endif
-#endif
-
-template <typename T>
-FORCE_INLINE std::pair<T, T> reduceSoftmax(T* data, const int size,
- const int capacity) {
- T max = data[0];
- for (int i = 1; i < size; ++i) {
- max = max >= data[i] ? max : data[i];
- }
-
- T sum = 0;
- for (int i = 0; i < size; ++i) {
- data[i] = std::exp(data[i] - max);
- sum += data[i];
- }
-
- int i = 0;
- for (; i < size; ++i) {
- data[i] /= sum;
- }
-
- for (; i < capacity; ++i) {
- data[i] = 0;
- }
-
- return {max, sum};
-}
-
-template <typename T>
-FORCE_INLINE std::pair<T, T> reduceSoftmaxAlibi(T* data, const int size,
- const int capacity,
- const float alibi_slope,
- const int start_index,
- const int seq_len) {
- data[0] += alibi_slope * (start_index - seq_len + 1);
- T max = data[0];
- for (int i = 1; i < size; ++i) {
- T qk = data[i] + alibi_slope * (start_index + i - seq_len + 1);
- data[i] = qk;
- max = max >= qk ? max : qk;
- }
-
- T sum = 0;
- for (int i = 0; i < size; ++i) {
- data[i] = std::exp(data[i] - max);
- sum += data[i];
- }
-
- int i = 0;
- for (; i < size; ++i) {
- data[i] /= sum;
- }
-
- for (; i < capacity; ++i) {
- data[i] = 0;
- }
-
- return {max, sum};
-}
-
-template <typename T>
-FORCE_INLINE void reducePartitionSoftmax(const T* max_data, T* sum_data,
- const int size) {
- T max = max_data[0];
- for (int i = 1; i < size; ++i) {
- max = max >= max_data[i] ? max : max_data[i];
- }
-
- T rescaled_sum = 0;
- for (int i = 0; i < size; ++i) {
- T rescale_factor = std::exp(max_data[i] - max);
- rescaled_sum += rescale_factor * sum_data[i];
- sum_data[i] *= rescale_factor;
- }
- for (int i = 0; i < size; ++i) {
- sum_data[i] /= rescaled_sum + 1e-8;
- }
-}
-
-template <typename scalar_t, int HEAD_SIZE, int BLOCK_SIZE, int x>
-struct reduceQKBlockKernel {
- using q_load_vec_type = typename KernelVecType<scalar_t>::q_load_vec_type;
- using q_vec_type = typename KernelVecType<scalar_t>::q_vec_type;
- using k_load_vec_type = typename KernelVecType<scalar_t>::k_load_vec_type;
- using k_vec_type = typename KernelVecType<scalar_t>::k_vec_type;
- using qk_acc_vec_type = typename KernelVecType<scalar_t>::qk_acc_vec_type;
-
- constexpr static int TOKEN_PER_GROUP = k_load_vec_type::get_elem_num() / x;
- constexpr static int MAX_GROUP_NUM = 16 / TOKEN_PER_GROUP;
- constexpr static int UNROLL_GROUP_NUM = MAX_GROUP_NUM / 4;
-
- static_assert(MAX_GROUP_NUM == 8 || MAX_GROUP_NUM == 4);
- static_assert(k_load_vec_type::get_elem_num() % x == 0);
- static_assert(q_load_vec_type::get_elem_num() * sizeof(scalar_t) == 16);
-
- FORCE_INLINE static void call(const scalar_t* __restrict__ q,
- const scalar_t* __restrict__ k_block,
- float* __restrict__ logits, float scale,
- const int token_num) {
- const int group_num = (token_num + TOKEN_PER_GROUP - 1) / TOKEN_PER_GROUP;
-
- qk_acc_vec_type group_accums[MAX_GROUP_NUM];
- if (token_num == BLOCK_SIZE) {
- for (int q_offset = 0; q_offset < HEAD_SIZE;
- q_offset += x, k_block += x * BLOCK_SIZE) {
- q_load_vec_type q_load_group_vec(q + q_offset);
- q_vec_type q_group_vec(q_load_group_vec);
-
- vec_op::unroll_loop<int, MAX_GROUP_NUM>(
- [k_block, &q_group_vec, &group_accums](int token_group_idx) {
- k_load_vec_type k_load_group_vec(k_block + token_group_idx * x *
- TOKEN_PER_GROUP);
- k_vec_type k_group_vec(k_load_group_vec);
- vec_op::fma(group_accums[token_group_idx], q_group_vec,
- k_group_vec);
- vec_op::prefetch(k_block + x * BLOCK_SIZE +
- token_group_idx * x * TOKEN_PER_GROUP);
- });
- }
- } else {
- for (int q_offset = 0; q_offset < HEAD_SIZE;
- q_offset += x, k_block += x * BLOCK_SIZE) {
- q_load_vec_type q_load_group_vec(q + q_offset);
- q_vec_type q_group_vec(q_load_group_vec);
- for (int token_group_start = 0; token_group_start < group_num;
- token_group_start += UNROLL_GROUP_NUM) {
- vec_op::unroll_loop<int, UNROLL_GROUP_NUM>(
- [token_group_start, k_block, &q_group_vec,
- &group_accums](int token_group_idx) {
- token_group_idx += token_group_start;
- k_load_vec_type k_load_group_vec(k_block + token_group_idx * x *
- TOKEN_PER_GROUP);
- k_vec_type k_group_vec(k_load_group_vec);
- vec_op::fma(group_accums[token_group_idx], q_group_vec,
- k_group_vec);
- vec_op::prefetch(k_block + x * BLOCK_SIZE +
- token_group_idx * x * TOKEN_PER_GROUP);
- });
- }
- }
- }
-
- for (int token_group_idx = 0; token_group_idx < group_num;
- ++token_group_idx) {
- vec_op::unroll_loop<int, TOKEN_PER_GROUP>(
- [&group_accums, logits, scale, token_group_idx](int token_idx) {
- float dot_v =
- group_accums[token_group_idx]
- .template reduce_sub_sum(token_idx);
- logits[token_group_idx * TOKEN_PER_GROUP + token_idx] =
- dot_v * scale;
- });
- }
- }
-};
-
-template
-FORCE_INLINE void reduceValueBlock(const float* prob, const scalar_t* v_block,
- acc_t&& acc) {
- using v_load_vec_type = typename KernelVecType<scalar_t>::v_load_vec_type;
- constexpr int ELEM_NUM = v_load_vec_type::get_elem_num();
- static_assert(BLOCK_SIZE == ELEM_NUM);
- vec_op::FP32Vec16 prob_vec(prob);
-
- vec_op::unroll_loop([&](int head_elem_idx) {
- v_load_vec_type v_vec(v_block + BLOCK_SIZE * head_elem_idx);
- vec_op::FP32Vec16 fp32_v_vec(v_vec);
- acc[head_elem_idx] = acc[head_elem_idx] + prob_vec * fp32_v_vec;
- });
-}
-}; // namespace
-
-// Paged attention v1
-namespace {
-template <typename scalar_t, int HEAD_SIZE, int BLOCK_SIZE>
-struct paged_attention_v1_impl {
- static void call(
- scalar_t* __restrict__ out, // [num_seqs, num_heads, head_size]
- const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size]
- const scalar_t* __restrict__ k_cache, // [num_blocks, num_kv_heads,
- // head_size/x, block_size, x]
- const scalar_t* __restrict__ v_cache, // [num_blocks, num_kv_heads,
- // head_size, block_size]
- const int num_kv_heads, const float scale,
- const int* __restrict__ block_tables, // [num_seqs,
- // max_num_blocks_per_seq]
- const int* __restrict__ seq_lens, // [num_seqs]
- const int max_num_blocks_per_seq,
- const float* __restrict__ alibi_slopes, // [num_heads]
- const int q_stride, const int kv_block_stride, const int kv_head_stride,
- const int num_seqs, const int num_heads) {
- constexpr int x = 16 / sizeof(scalar_t);
- const int num_queries_per_kv = num_heads / num_kv_heads;
-
- static_assert(BLOCK_SIZE == 16);
-
- int max_seq_len = max_num_blocks_per_seq * BLOCK_SIZE;
- int max_seq_len_padded = (max_seq_len + 15) & 0xFFFFFFF0;
- TORCH_CHECK((max_seq_len_padded * sizeof(float)) % 64 == 0);
-
- const int parallel_work_item_num = omp_get_max_threads();
-
- size_t logits_bytes =
- parallel_work_item_num * max_seq_len_padded * sizeof(float);
- float* logits = (float*)std::aligned_alloc(
- 64, logits_bytes); // Cacheline alignment for each context token.
- // [parallel_work_item_num, max_seq_len_padded]
-
-#pragma omp parallel for collapse(2) schedule(dynamic, 1)
- for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) {
- for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
- int seq_len = seq_lens[seq_idx];
- const int* seq_block_table =
- block_tables + max_num_blocks_per_seq * seq_idx;
- const int block_num = (seq_len + BLOCK_SIZE - 1) / BLOCK_SIZE;
- const int64_t kv_head_idx = head_idx / num_queries_per_kv;
- const scalar_t* __restrict__ q_vec_ptr =
- q + seq_idx * q_stride + head_idx * HEAD_SIZE;
- const int last_block_token_num = seq_len - (block_num - 1) * BLOCK_SIZE;
- float* __restrict__ thread_block_logits =
- logits + omp_get_thread_num() * max_seq_len_padded;
-
- // Compute logits
- for (int block_idx = 0; block_idx < block_num; ++block_idx) {
- const int64_t physical_block_idx = seq_block_table[block_idx];
- const scalar_t* __restrict__ k_block_cache_ptr =
- k_cache + physical_block_idx * kv_block_stride +
- kv_head_idx * kv_head_stride;
- float* __restrict__ head_block_logits =
- thread_block_logits + block_idx * BLOCK_SIZE;
-
- reduceQKBlockKernel<scalar_t, HEAD_SIZE, BLOCK_SIZE, x>::call(
- q_vec_ptr, k_block_cache_ptr, head_block_logits, scale,
- block_idx == block_num - 1 ? last_block_token_num : BLOCK_SIZE);
- }
-
- // Compute softmax
- if (alibi_slopes) {
- reduceSoftmaxAlibi(thread_block_logits, seq_len,
- block_num * BLOCK_SIZE, alibi_slopes[head_idx], 0,
- seq_len);
- } else {
- reduceSoftmax(thread_block_logits, seq_len, block_num * BLOCK_SIZE);
- }
-
- // Compute value
- constexpr int head_elem_num_per_partition = 16;
- constexpr int head_partition_num =
- HEAD_SIZE / head_elem_num_per_partition;
- for (int head_part_idx = 0; head_part_idx < head_partition_num;
- ++head_part_idx) {
- vec_op::FP32Vec16 accums[head_elem_num_per_partition];
- scalar_t* __restrict__ out_ptr =
- out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE +
- head_part_idx * head_elem_num_per_partition;
- for (int block_idx = 0; block_idx < block_num; ++block_idx) {
- const int64_t physical_block_idx = seq_block_table[block_idx];
- const float* __restrict__ prob_vec_ptr =
- thread_block_logits + block_idx * BLOCK_SIZE;
- const scalar_t* __restrict__ v_block_cache_ptr =
- v_cache + physical_block_idx * kv_block_stride +
- kv_head_idx * kv_head_stride +
- BLOCK_SIZE * head_part_idx * head_elem_num_per_partition;
- reduceValueBlock(
- prob_vec_ptr, v_block_cache_ptr, accums);
-
- if (block_idx != block_num - 1) {
- const int64_t next_physical_block_idx =
- seq_block_table[block_idx + 1];
- const scalar_t* __restrict__ next_v_block_cache_ptr =
- v_cache + next_physical_block_idx * kv_block_stride +
- kv_head_idx * kv_head_stride +
- BLOCK_SIZE * head_part_idx * head_elem_num_per_partition;
- vec_op::unroll_loop(
- [&](int head_elem_idx) {
- if (head_elem_idx % 2 == 0) {
- vec_op::prefetch(next_v_block_cache_ptr +
- BLOCK_SIZE * head_elem_idx);
- }
- });
- }
- }
-
- vec_op::unroll_loop(
- [&](int head_elem_idx) {
- float value = accums[head_elem_idx].reduce_sum();
- vec_op::storeFP32(value, out_ptr + head_elem_idx);
- });
- }
- }
- }
- std::free(logits);
- }
-};
-
-#define LAUNCH_V1_ATTENTION_KERNEL(T, HEAD_SIZE, BLOCK_SIZE) \
- paged_attention_v1_impl<T, HEAD_SIZE, BLOCK_SIZE>::call( \
- out_ptr, query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale, \
- block_tables_ptr, seq_lens_ptr, max_num_blocks_per_seq, \
- alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride, num_seqs, \
- num_heads);
-
-template <typename T, int BLOCK_SIZE>
-void paged_attention_v1_impl_launcher(
- torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
- torch::Tensor& value_cache, int num_kv_heads, float scale,
- torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len,
- const std::optional<torch::Tensor>& alibi_slopes) {
- int num_seqs = query.size(0);
- int num_heads = query.size(1);
- int head_size = query.size(2);
- int max_num_blocks_per_seq = block_tables.size(1);
- int q_stride = query.stride(0);
- int kv_block_stride = key_cache.stride(0);
- int kv_head_stride = key_cache.stride(1);
-
- // NOTE: alibi_slopes is optional.
- const float* alibi_slopes_ptr =
- alibi_slopes
- ? reinterpret_cast<const float*>(alibi_slopes.value().data_ptr())
- : nullptr;
-
- T* out_ptr = reinterpret_cast<T*>(out.data_ptr());
- T* query_ptr = reinterpret_cast<T*>(query.data_ptr());
- T* key_cache_ptr = reinterpret_cast<T*>(key_cache.data_ptr());
- T* value_cache_ptr = reinterpret_cast<T*>(value_cache.data_ptr());
- int* block_tables_ptr = block_tables.data_ptr<int>();
- int* seq_lens_ptr = seq_lens.data_ptr<int>();
-
- switch (head_size) {
- case 32:
- LAUNCH_V1_ATTENTION_KERNEL(T, 32, BLOCK_SIZE);
- break;
- case 64:
- LAUNCH_V1_ATTENTION_KERNEL(T, 64, BLOCK_SIZE);
- break;
- case 80:
- LAUNCH_V1_ATTENTION_KERNEL(T, 80, BLOCK_SIZE);
- break;
- case 96:
- LAUNCH_V1_ATTENTION_KERNEL(T, 96, BLOCK_SIZE);
- break;
- case 112:
- LAUNCH_V1_ATTENTION_KERNEL(T, 112, BLOCK_SIZE);
- break;
- case 128:
- LAUNCH_V1_ATTENTION_KERNEL(T, 128, BLOCK_SIZE);
- break;
- case 192:
- LAUNCH_V1_ATTENTION_KERNEL(T, 192, BLOCK_SIZE);
- break;
- case 256:
- LAUNCH_V1_ATTENTION_KERNEL(T, 256, BLOCK_SIZE);
- break;
- default:
- TORCH_CHECK(false, "Unsupported head size: ", head_size);
- break;
- }
-}
-
-#define CALL_V1_KERNEL_LAUNCHER(T, BLOCK_SIZE) \
- paged_attention_v1_impl_launcher<T, BLOCK_SIZE>( \
- out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, \
- seq_lens, max_seq_len, alibi_slopes);
-
-#define CALL_V1_KERNEL_LAUNCHER_BLOCK_SIZE(T) \
- switch (block_size) { \
- case 16: \
- CALL_V1_KERNEL_LAUNCHER(T, 16); \
- break; \
- default: \
- TORCH_CHECK(false, "Unsupported block size: ", block_size); \
- break; \
- }
-} // namespace
-
-void paged_attention_v1(
- torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
- torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
- torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
- int64_t max_seq_len, const std::optional<torch::Tensor>& alibi_slopes,
- const std::string& kv_cache_dtype, torch::Tensor& k_scale,
- torch::Tensor& v_scale, const int64_t tp_rank,
- const int64_t blocksparse_local_blocks,
- const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
- const int64_t blocksparse_head_sliding_step) {
- TORCH_CHECK(blocksparse_vert_stride <= 1,
- "CPU backend does not support blocksparse attention yet.");
- VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "paged_attention_v1_impl",
- [&] {
- CPU_KERNEL_GUARD_IN(paged_attention_v1_impl)
- CALL_V1_KERNEL_LAUNCHER_BLOCK_SIZE(scalar_t);
- CPU_KERNEL_GUARD_OUT(paged_attention_v1_impl)
- });
-}
-
-// Paged attention v2
-namespace {
-template <typename scalar_t, int HEAD_SIZE, int BLOCK_SIZE, int PARTITION_SIZE>
-struct paged_attention_v2_impl {
- static void call(
- scalar_t* __restrict__ out, // [num_seqs, num_heads, head_size]
- float* __restrict__ exp_sums, // [num_seqs, num_heads,
- // max_num_partitions]
- float* __restrict__ max_logits, // [num_seqs, num_heads,
- // max_num_partitions]
- scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads,
- // max_num_partitions, head_size]
- const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size]
- const scalar_t* __restrict__ k_cache, // [num_blocks, num_kv_heads,
- // head_size/x, block_size, x]
- const scalar_t* __restrict__ v_cache, // [num_blocks, num_kv_heads,
- // head_size, block_size]
- const int num_kv_heads, const float scale,
- const int* __restrict__ block_tables, // [num_seqs,
- // max_num_blocks_per_seq]
- const int* __restrict__ seq_lens, // [num_seqs]
- const int max_num_blocks_per_seq,
- const float* __restrict__ alibi_slopes, // [num_heads]
- const int q_stride, const int kv_block_stride, const int kv_head_stride,
- const int num_seqs, const int num_heads, const int max_num_partitions) {
- constexpr int x = 16 / sizeof(scalar_t);
- const int num_queries_per_kv = num_heads / num_kv_heads;
-
- static_assert(BLOCK_SIZE == 16);
- static_assert(PARTITION_SIZE * sizeof(float) % 64 == 0);
- static_assert(PARTITION_SIZE % BLOCK_SIZE == 0);
-
-#pragma omp parallel for collapse(3) schedule(static, 1)
- for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) {
- for (int partition_idx = 0; partition_idx < max_num_partitions;
- ++partition_idx) {
- for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
- const int seq_len = seq_lens[seq_idx];
- const int start_token_idx = partition_idx * PARTITION_SIZE;
-
- if (start_token_idx >= seq_len) continue;
-
- const int partition_num =
- (seq_len + PARTITION_SIZE - 1) / PARTITION_SIZE;
- const bool no_reduce = (partition_num == 1);
- const int token_num =
- (std::min(seq_len, start_token_idx + PARTITION_SIZE) -
- start_token_idx);
- const int block_num = (token_num + BLOCK_SIZE - 1) / BLOCK_SIZE;
- const int last_block_token_num =
- token_num - (block_num - 1) * BLOCK_SIZE;
- const int* seq_block_table = block_tables +
- max_num_blocks_per_seq * seq_idx +
- start_token_idx / BLOCK_SIZE;
- const int64_t kv_head_idx = head_idx / num_queries_per_kv;
- const scalar_t* __restrict__ q_vec_ptr =
- q + seq_idx * q_stride + head_idx * HEAD_SIZE;
-
- float logits[PARTITION_SIZE] __attribute__((aligned(64))) = {0};
-
- // Compute logits
- for (int block_idx = 0; block_idx < block_num; ++block_idx) {
- const int64_t physical_block_idx = seq_block_table[block_idx];
- const scalar_t* __restrict__ k_block_cache_ptr =
- k_cache + physical_block_idx * kv_block_stride +
- kv_head_idx * kv_head_stride;
- float* __restrict__ head_block_logits =
- logits + block_idx * BLOCK_SIZE;
-
- reduceQKBlockKernel<scalar_t, HEAD_SIZE, BLOCK_SIZE, x>::call(
- q_vec_ptr, k_block_cache_ptr, head_block_logits, scale,
- block_idx == block_num - 1 ? last_block_token_num : BLOCK_SIZE);
- }
-
- std::pair<float, float> max_and_sum;
- if (alibi_slopes) {
- max_and_sum = reduceSoftmaxAlibi(
- logits, token_num, block_num * BLOCK_SIZE,
- alibi_slopes[head_idx], start_token_idx, seq_len);
- } else {
- max_and_sum =
- reduceSoftmax(logits, token_num, block_num * BLOCK_SIZE);
- }
-
- auto&& [max_logit, exp_sum] = max_and_sum;
-
- scalar_t* __restrict__ output_buffer = nullptr;
- if (!no_reduce) {
- auto idx = seq_idx * num_heads * max_num_partitions +
- head_idx * max_num_partitions + partition_idx;
- max_logits[idx] = max_logit;
- exp_sums[idx] = exp_sum;
- output_buffer =
- tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE +
- head_idx * max_num_partitions * HEAD_SIZE +
- partition_idx * HEAD_SIZE;
- } else {
- output_buffer =
- out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE;
- }
-
- // Compute value
- constexpr int head_elem_num_per_partition = 16;
- constexpr int head_partition_num =
- HEAD_SIZE / head_elem_num_per_partition;
- for (int head_part_idx = 0; head_part_idx < head_partition_num;
- ++head_part_idx) {
- vec_op::FP32Vec16 accums[head_elem_num_per_partition];
- scalar_t* __restrict__ out_ptr =
- output_buffer + head_part_idx * head_elem_num_per_partition;
- for (int block_idx = 0; block_idx < block_num; ++block_idx) {
- const int64_t physical_block_idx = seq_block_table[block_idx];
- const float* __restrict__ prob_vec_ptr =
- logits + block_idx * BLOCK_SIZE;
- const scalar_t* __restrict__ v_block_cache_ptr =
- v_cache + physical_block_idx * kv_block_stride +
- kv_head_idx * kv_head_stride +
- BLOCK_SIZE * head_part_idx * head_elem_num_per_partition;
- reduceValueBlock(
- prob_vec_ptr, v_block_cache_ptr, accums);
-
- if (block_idx != block_num - 1) {
- const int64_t next_physical_block_idx =
- seq_block_table[block_idx + 1];
- const scalar_t* __restrict__ next_v_block_cache_ptr =
- v_cache + next_physical_block_idx * kv_block_stride +
- kv_head_idx * kv_head_stride +
- BLOCK_SIZE * head_part_idx * head_elem_num_per_partition;
- vec_op::unroll_loop(
- [&](int head_elem_idx) {
- if (head_elem_idx % 2 == 0) {
- vec_op::prefetch(next_v_block_cache_ptr +
- BLOCK_SIZE * head_elem_idx);
- }
- });
- }
- }
-
- vec_op::unroll_loop(
- [&](int head_elem_idx) {
- float value = accums[head_elem_idx].reduce_sum();
- vec_op::storeFP32(value, out_ptr + head_elem_idx);
- });
- }
- }
- }
- }
-
- // Rescale partition softmax and store the factors to exp_sums
-#pragma omp parallel for collapse(2) schedule(static, 1)
- for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) {
- for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
- const int seq_len = seq_lens[seq_idx];
- const int partition_num =
- (seq_len + PARTITION_SIZE - 1) / PARTITION_SIZE;
-
- if (partition_num == 1) continue;
-
- reducePartitionSoftmax(
- max_logits + seq_idx * num_heads * max_num_partitions +
- head_idx * max_num_partitions,
- exp_sums + seq_idx * num_heads * max_num_partitions +
- head_idx * max_num_partitions,
- partition_num);
- }
- }
-
- // Reduce values
- using v_load_vec_type = typename KernelVecType<scalar_t>::v_load_vec_type;
- static_assert(v_load_vec_type::get_elem_num() == BLOCK_SIZE);
- constexpr int head_elem_num_per_group =
- 16; // Note: didn't align with the cacheline size, due to some
- // HEAD_SIZE didn't align with 64 bytes
- static_assert(HEAD_SIZE % head_elem_num_per_group == 0);
- constexpr int head_group_num = HEAD_SIZE / head_elem_num_per_group;
- const float* __restrict__ rescale_factors = exp_sums;
-#pragma omp parallel for collapse(3) schedule(static, 1)
- for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) {
- for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
- for (int group_idx = 0; group_idx < head_group_num; ++group_idx) {
- const int seq_len = seq_lens[seq_idx];
- const int partition_num =
- (seq_len + PARTITION_SIZE - 1) / PARTITION_SIZE;
-
- if (partition_num == 1) continue;
-
- const float* __restrict__ seq_head_rescale_factors =
- rescale_factors + seq_idx * num_heads * max_num_partitions +
- head_idx * max_num_partitions;
- const scalar_t* __restrict__ seq_head_tmp_out =
- tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE +
- head_idx * max_num_partitions * HEAD_SIZE +
- group_idx * head_elem_num_per_group;
- scalar_t* __restrict__ seq_head_output =
- out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE +
- group_idx * head_elem_num_per_group;
-
- vec_op::FP32Vec16 acc;
- for (int i = 0; i < partition_num; ++i) {
- vec_op::FP32Vec16 rescale_factor(seq_head_rescale_factors[i]);
- v_load_vec_type value(seq_head_tmp_out + i * HEAD_SIZE);
- vec_op::FP32Vec16 fp32_value(value);
- acc = acc + fp32_value * rescale_factor;
- }
- v_load_vec_type cast_acc(acc);
- cast_acc.save(seq_head_output);
- }
- }
- }
- }
-};
-
-#define LAUNCH_V2_ATTENTION_KERNEL(T, HEAD_SIZE, BLOCK_SIZE) \
- paged_attention_v2_impl::call( \
- out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, query_ptr, \
- key_cache_ptr, value_cache_ptr, num_kv_heads, scale, block_tables_ptr, \
- seq_lens_ptr, max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, \
- kv_block_stride, kv_head_stride, num_seqs, num_heads, \
- max_num_partitions);
-
-template
-void paged_attention_v2_impl_launcher(
- torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
- torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
- torch::Tensor& value_cache, int num_kv_heads, float scale,
- torch::Tensor& block_tables, torch::Tensor& seq_lens, int block_size,
- int max_seq_len, const std::optional<torch::Tensor>& alibi_slopes) {
- int num_seqs = query.size(0);
- int num_heads = query.size(1);
- int head_size = query.size(2);
- int max_num_blocks_per_seq = block_tables.size(1);
- int q_stride = query.stride(0);
- int kv_block_stride = key_cache.stride(0);
- int kv_head_stride = key_cache.stride(1);
- int max_num_partitions = exp_sums.size(-1);
-
- // NOTE: alibi_slopes is optional.
- const float* alibi_slopes_ptr =
- alibi_slopes
- ? reinterpret_cast<const float*>(alibi_slopes.value().data_ptr())
- : nullptr;
-
- T* out_ptr = reinterpret_cast<T*>(out.data_ptr());
- float* exp_sums_ptr = reinterpret_cast<float*>(exp_sums.data_ptr());
- float* max_logits_ptr = reinterpret_cast<float*>(max_logits.data_ptr());
- T* tmp_out_ptr = reinterpret_cast<T*>(tmp_out.data_ptr());
- T* query_ptr = reinterpret_cast<T*>(query.data_ptr());
- T* key_cache_ptr = reinterpret_cast<T*>(key_cache.data_ptr());
- T* value_cache_ptr = reinterpret_cast<T*>(value_cache.data_ptr());
- int* block_tables_ptr = block_tables.data_ptr<int>();
- int* seq_lens_ptr = seq_lens.data_ptr<int>();
-
- switch (head_size) {
- case 32:
- LAUNCH_V2_ATTENTION_KERNEL(T, 32, BLOCK_SIZE);
- break;
- case 64:
- LAUNCH_V2_ATTENTION_KERNEL(T, 64, BLOCK_SIZE);
- break;
- case 80:
- LAUNCH_V2_ATTENTION_KERNEL(T, 80, BLOCK_SIZE);
- break;
- case 96:
- LAUNCH_V2_ATTENTION_KERNEL(T, 96, BLOCK_SIZE);
- break;
- case 112:
- LAUNCH_V2_ATTENTION_KERNEL(T, 112, BLOCK_SIZE);
- break;
- case 128:
- LAUNCH_V2_ATTENTION_KERNEL(T, 128, BLOCK_SIZE);
- break;
- case 192:
- LAUNCH_V2_ATTENTION_KERNEL(T, 192, BLOCK_SIZE);
- break;
- case 256:
- LAUNCH_V2_ATTENTION_KERNEL(T, 256, BLOCK_SIZE);
- break;
- default:
- TORCH_CHECK(false, "Unsupported head size: ", head_size);
- break;
- }
-}
-
-#define CALL_V2_KERNEL_LAUNCHER(T, BLOCK_SIZE) \
- paged_attention_v2_impl_launcher( \
- out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \
- num_kv_heads, scale, block_tables, seq_lens, block_size, max_seq_len, \
- alibi_slopes);
-
-#define CALL_V2_KERNEL_LAUNCHER_BLOCK_SIZE(T) \
- switch (block_size) { \
- case 16: \
- CALL_V2_KERNEL_LAUNCHER(T, 16); \
- break; \
- default: \
- TORCH_CHECK(false, "Unsupported block size: ", block_size); \
- break; \
- }
-} // namespace
-
-void paged_attention_v2(
- torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
- torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
- torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
- torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
- int64_t max_seq_len, const std::optional<torch::Tensor>& alibi_slopes,
- const std::string& kv_cache_dtype, torch::Tensor& k_scale,
- torch::Tensor& v_scale, const int64_t tp_rank,
- const int64_t blocksparse_local_blocks,
- const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
- const int64_t blocksparse_head_sliding_step) {
- TORCH_CHECK(blocksparse_vert_stride <= 1,
- "CPU backend does not support blocksparse attention yet.");
- VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "paged_attention_v2_impl",
- [&] {
- CPU_KERNEL_GUARD_IN(paged_attention_v2_impl)
- CALL_V2_KERNEL_LAUNCHER_BLOCK_SIZE(scalar_t);
- CPU_KERNEL_GUARD_OUT(paged_attention_v2_impl)
- });
-}
\ No newline at end of file
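For reference, the deleted `paged_attention_v2` path computed a numerically stable softmax per partition (keeping each partition's max and sum) and then merged the partitions with `reducePartitionSoftmax` before the weighted value reduction. A minimal NumPy sketch of that merge, written against the same definitions rather than the C++ kernel itself:

```python
import numpy as np


def partitioned_softmax(scores: np.ndarray, partition_size: int) -> np.ndarray:
    # Stable softmax per partition, keeping each partition's (max, sum).
    parts, maxes, sums = [], [], []
    for start in range(0, len(scores), partition_size):
        chunk = scores[start:start + partition_size]
        m = chunk.max()
        e = np.exp(chunk - m)
        parts.append(e / e.sum())
        maxes.append(m)
        sums.append(e.sum())

    # Merge: rescale each partition's sum by exp(max_k - global_max), then weight
    # each partition's local probabilities by its share of the global sum.
    maxes, sums = np.array(maxes), np.array(sums)
    rescaled = np.exp(maxes - maxes.max()) * sums
    factors = rescaled / rescaled.sum()
    return np.concatenate([f * p for f, p in zip(factors, parts)])


scores = np.random.default_rng(0).standard_normal(40)
ref = np.exp(scores - scores.max())
ref /= ref.sum()
assert np.allclose(partitioned_softmax(scores, 16), ref)
```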
diff --git a/csrc/cpu/cache.cpp b/csrc/cpu/cache.cpp
deleted file mode 100644
index 69f6d06e3c96..000000000000
--- a/csrc/cpu/cache.cpp
+++ /dev/null
@@ -1,214 +0,0 @@
-#include