Commit a5bd54a

Merge remote-tracking branch 'upstream/main' into fix_cross_attention
Signed-off-by: fsx950223 <fsx950223@outlook.com>
2 parents: 232b99f + b9ce9a3

File tree: 427 files changed (+21,137 / -10,386 lines)


.buildkite/release-pipeline.yaml

Lines changed: 1 addition & 1 deletion
@@ -132,7 +132,7 @@ steps:
       queue: cpu_queue_postmerge
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
       - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
       - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
     env:
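
The only functional change above is the extra VLLM_CPU_AMXBF16=true build argument. As a rough local check, the same image can be built outside Buildkite along these lines (the local tag name is a placeholder, not part of this commit):

    # Build the CPU image with AMX BF16 enabled, mirroring the release pipeline step.
    DOCKER_BUILDKIT=1 docker build \
      --build-arg max_jobs=16 \
      --build-arg GIT_REPO_CHECK=1 \
      --build-arg VLLM_CPU_AVX512BF16=true \
      --build-arg VLLM_CPU_AVX512VNNI=true \
      --build-arg VLLM_CPU_AMXBF16=true \
      --tag vllm-cpu:local \
      --progress plain \
      --target vllm-openai \
      -f docker/Dockerfile.cpu .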

.buildkite/scripts/hardware_ci/run-amd-test.sh

Lines changed: 2 additions & 6 deletions
@@ -78,17 +78,13 @@ HF_MOUNT="/root/.cache/huggingface"
 commands=$@
 echo "Commands:$commands"
 
-if [[ $commands == *"pytest -v -s basic_correctness/test_basic_correctness.py"* ]]; then
-  commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s basic_correctness/test_basic_correctness.py"}
-fi
+commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"pytest -v -s basic_correctness/test_basic_correctness.py"}
 
 if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
   commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
 fi
 
-if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
-  commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
-fi
+commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"pytest -v -s compile/test_basic_correctness.py"}
 
 if [[ $commands == *"pytest -v -s lora"* ]]; then
   commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
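
The rewritten lines drop the per-test if guards in favor of unconditional bash pattern substitution, ${var//pattern/replacement}, which rewrites every occurrence and is a no-op when the pattern is absent. A minimal illustration of the idiom (example values only, not from this script):

    commands="pytest -v -s lora && pytest -v -s models/test_registry.py"
    # Prefix the lora invocation wherever it appears; other commands are untouched.
    commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
    echo "$commands"
    # -> VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora && pytest -v -s models/test_registry.py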

.buildkite/scripts/hardware_ci/run-cpu-test.sh

Lines changed: 3 additions & 2 deletions
@@ -49,6 +49,7 @@ function cpu_tests() {
   # Run kernel tests
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
+    pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
     pytest -x -v -s tests/kernels/test_onednn.py"
 
   # Run basic model test
@@ -76,7 +77,7 @@ function cpu_tests() {
   # Run AWQ test
   # docker exec cpu-test-"$NUMA_NODE" bash -c "
   #   set -e
-  #   VLLM_USE_V1=0 pytest -x -s -v \
+  #   pytest -x -s -v \
   #   tests/quantization/test_ipex_quant.py"
 
   # Run multi-lora tests
@@ -116,4 +117,4 @@ function cpu_tests() {
 
 # All of CPU tests are expected to be finished less than 40 mins.
 export -f cpu_tests
-timeout 2h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
+timeout 2.5h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
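
The last hunk only widens the overall budget from 2h to 2.5h. The surrounding pattern, exporting a shell function so a timeout-wrapped child shell can invoke it, works roughly like this (the function body and values are stand-ins for cpu_tests):

    CORE_RANGE="0-7"; NUMA_NODE=0                  # example values
    my_tests() { echo "running on cores $1, NUMA node $2"; }
    export -f my_tests                             # make the function visible to the child bash
    timeout 2.5h bash -c "my_tests $CORE_RANGE $NUMA_NODE"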

.buildkite/test-amd.yaml

Lines changed: 1 addition & 0 deletions
@@ -348,6 +348,7 @@ steps:
   - pytest -v -s -m 'not cpu_test' v1/metrics
   - pytest -v -s v1/test_oracle.py
   - pytest -v -s v1/test_request.py
+  - pytest -v -s v1/test_outputs.py
   # Integration test for streaming correctness (requires special branch).
   - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
   - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine

.buildkite/test-pipeline.yaml

Lines changed: 15 additions & 5 deletions
@@ -329,6 +329,7 @@ steps:
   - pytest -v -s -m 'not cpu_test' v1/metrics
   - pytest -v -s v1/test_oracle.py
   - pytest -v -s v1/test_request.py
+  - pytest -v -s v1/test_outputs.py
   # Integration test for streaming correctness (requires special branch).
   - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
   - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
@@ -450,6 +451,7 @@ steps:
   - pytest -v -s compile/test_decorator.py
   - pytest -v -s compile/test_noop_elimination.py
   - pytest -v -s compile/test_aot_compile.py
+  - pytest -v -s compile/test_qk_norm_rope_fusion.py
 
 - label: PyTorch Fullgraph Smoke Test # 15min
   timeout_in_minutes: 30
@@ -463,16 +465,16 @@ steps:
   - pytest -v -s compile/test_multimodal_compile.py
   - pytest -v -s compile/piecewise/
 
-- label: PyTorch Fullgraph Test # 22min
-  timeout_in_minutes: 35
+- label: PyTorch Fullgraph Test # 27min
+  timeout_in_minutes: 40
   mirror_hardwares: [amdexperimental]
   torch_nightly: true
   source_file_dependencies:
   - vllm/
   - tests/compile
   commands:
-  - pytest -v -s compile/test_full_graph.py
-  # Limit to no custom ops to reduce running time
+  - pytest -v -s compile/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
+  # Limit to no custom ops to reduce running time
   # Wrap with quotes to escape yaml and avoid starting -k string with a -
   - "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and -quant_fp8'"
 
@@ -890,11 +892,16 @@ steps:
   - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
   - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
   - vllm/v1/attention/backends/flashinfer.py
+  - vllm/v1/attention/backends/mla/cutlass_mla.py
+  - vllm/v1/attention/backends/mla/flashinfer_mla.py
+  - vllm/platforms/cuda.py
+  - vllm/attention/selector.py
   commands:
   - nvidia-smi
   - python3 examples/offline_inference/basic/chat.py
   # Attention
   # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
+  - pytest -v -s tests/kernels/attention/test_attention_selector.py
   - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
   - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
   - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
@@ -951,10 +958,13 @@ steps:
   - vllm/model_executor/layers/activation.py
   - vllm/model_executor/layers/quantization/input_quant_fp8.py
   - tests/compile/test_fusions_e2e.py
+  - tests/compile/test_full_graph.py
   commands:
   - nvidia-smi
   # Run all e2e fusion tests
   - pytest -v -s tests/compile/test_fusions_e2e.py
+  # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
+  - pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile
 
 - label: Blackwell GPT-OSS Eval
   timeout_in_minutes: 60
@@ -1253,7 +1263,7 @@ steps:
   - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
   - pytest -v -s tests/distributed/test_context_parallel.py
   - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
-  - pytest -v -s tests/v1/distributed/test_dbo.py
+  - pytest -v -s tests/v1/distributed/test_dbo.py
 
 ##### B200 test #####
 - label: Distributed Tests (B200) # optional
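
The retimed Fullgraph step now deselects test_fp8_kv_scale_compile by name, while the Blackwell fusion step runs it explicitly on hardware with FlashAttention support. For reference, the two invocations pair up like this (paths as in the pipeline above):

    # Regular Fullgraph step: run the file but skip the FP8 KV-scale compile test.
    pytest -v -s compile/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
    # Blackwell step: run only that test, selected by its node ID.
    pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile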

.github/CODEOWNERS

Lines changed: 26 additions & 6 deletions
@@ -9,7 +9,7 @@
 /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
 /vllm/model_executor/layers/mamba @tdoublep
 /vllm/model_executor/model_loader @22quinn
-/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche
+/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche @tjtanaa
 /vllm/vllm_flash_attn @LucasWilkinson
 /vllm/lora @jeejeelee
 /vllm/reasoning @aarnphm @chaunceyjiang
@@ -61,6 +61,16 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /vllm/model_executor/models/transformers @hmellor
 /tests/models/test_transformers.py @hmellor
 
+# Observability
+/vllm/config/observability.py @markmc
+/vllm/v1/metrics @markmc
+/tests/v1/metrics @markmc
+/vllm/tracing.py @markmc
+/tests/v1/tracing/test_tracing.py @markmc
+/vllm/config/kv_events.py @markmc
+/vllm/distributed/kv_events.py @markmc
+/tests/distributed/test_events.py @markmc
+
 # Docs
 /docs/mkdocs @hmellor
 /docs/**/*.yml @hmellor
@@ -105,11 +115,21 @@ mkdocs.yaml @hmellor
 /vllm/attention/ops/triton_unified_attention.py @tdoublep
 
 # ROCm related: specify owner with write access to notify AMD folks for careful code review
-/docker/Dockerfile.rocm* @gshtras
-/vllm/v1/attention/backends/rocm*.py @gshtras
-/vllm/v1/attention/backends/mla/rocm*.py @gshtras
-/vllm/attention/ops/rocm*.py @gshtras
-/vllm/model_executor/layers/fused_moe/rocm*.py @gshtras
+/vllm/**/*rocm* @tjtanaa
+/docker/Dockerfile.rocm* @gshtras @tjtanaa
+/vllm/v1/attention/backends/rocm*.py @gshtras @tjtanaa
+/vllm/v1/attention/backends/mla/rocm*.py @gshtras @tjtanaa
+/vllm/attention/ops/rocm*.py @gshtras @tjtanaa
+/vllm/model_executor/layers/fused_moe/rocm*.py @gshtras @tjtanaa
+/csrc/rocm @gshtras @tjtanaa
+/requirements/*rocm* @tjtanaa
+/tests/**/*rocm* @tjtanaa
+/docs/**/*rocm* @tjtanaa
+/vllm/**/*quark* @tjtanaa
+/tests/**/*quark* @tjtanaa
+/docs/**/*quark* @tjtanaa
+/vllm/**/*aiter* @tjtanaa
+/tests/**/*aiter* @tjtanaa
 
 # TPU
 /vllm/v1/worker/tpu* @NickLucche

.github/mergify.yml

Lines changed: 17 additions & 0 deletions
@@ -151,6 +151,23 @@ pull_request_rules:
       add:
         - gpt-oss
 
+- name: label-nvidia
+  description: Automatically apply nvidia label
+  conditions:
+    - label != stale
+    - or:
+        - files~=cuda
+        - files~=cutlass
+        - files~=flashinfer
+        - files~=trtllm
+        - title~=(?i)NVIDIA
+        - title~=(?i)CUDA
+        - title~=(?i)CUTLASS
+  actions:
+    label:
+      add:
+        - nvidia
+
 - name: label-rocm
   description: Automatically apply rocm label
   conditions:
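
The files~= conditions are regexes matched against changed file paths, and the title~= conditions add case-insensitive title matching on top. A rough local preview of whether a branch would trip the file-based part of the rule (the base ref here is an assumption, not part of the rule):

    # List files changed relative to main and test them against the rule's path patterns.
    git diff --name-only origin/main... | grep -E 'cuda|cutlass|flashinfer|trtllm' \
      && echo "nvidia label would likely apply" \
      || echo "no path match"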

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
@@ -38,7 +38,7 @@ repos:
     rev: 0.9.1
     hooks:
       - id: pip-compile
-        args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu129, --python-platform, x86_64-manylinux_2_28]
+        args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu129, --python-platform, x86_64-manylinux_2_28, --python-version, "3.12"]
         files: ^requirements/test\.(in|txt)$
   - repo: local
     hooks:

CMakeLists.txt

Lines changed: 5 additions & 4 deletions
@@ -241,7 +241,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   message(STATUS "Enabling cumem allocator extension.")
   # link against cuda driver library
   list(APPEND CUMEM_LIBS CUDA::cuda_driver)
-  define_gpu_extension_target(
+  define_extension_target(
     cumem_allocator
     DESTINATION vllm
     LANGUAGE CXX
@@ -265,6 +265,7 @@ set(VLLM_EXT_SRC
   "csrc/pos_encoding_kernels.cu"
   "csrc/activation_kernels.cu"
   "csrc/layernorm_kernels.cu"
+  "csrc/fused_qknorm_rope_kernel.cu"
   "csrc/layernorm_quant_kernels.cu"
   "csrc/sampler.cu"
   "csrc/cuda_view.cu"
@@ -858,7 +859,7 @@ if (VLLM_GPU_LANG STREQUAL "HIP")
 endif()
 
 message(STATUS "Enabling C extension.")
-define_gpu_extension_target(
+define_extension_target(
   _C
   DESTINATION vllm
   LANGUAGE ${VLLM_GPU_LANG}
@@ -973,7 +974,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 endif()
 
 message(STATUS "Enabling moe extension.")
-define_gpu_extension_target(
+define_extension_target(
   _moe_C
   DESTINATION vllm
   LANGUAGE ${VLLM_GPU_LANG}
@@ -994,7 +995,7 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
   "csrc/rocm/skinny_gemms.cu"
   "csrc/rocm/attention.cu")
 
-define_gpu_extension_target(
+define_extension_target(
   _rocm_C
   DESTINATION vllm
   LANGUAGE ${VLLM_GPU_LANG}

README.md

Lines changed: 1 addition & 0 deletions
@@ -21,6 +21,7 @@ Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundatio
 
 *Latest News* 🔥
 
+- [2025/11] We hosted [the first vLLM Europe Meetup in Zurich](https://luma.com/0gls27kb) focused on quantization, distributed inference, and reinforcement learning at scale with speakers from Mistral, IBM, and Red Hat. Please find the meetup slides [here](https://docs.google.com/presentation/d/1UC9PTLCHYXQpOmJDSFg6Sljra3iVXzc09DeEI7dnxMc/edit?usp=sharing) and recording [here](https://www.youtube.com/watch?v=6m6ZE6yVEDI)
 - [2025/11] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/xSrYXjNgr1HbCP4ExYNG1w) focusing on distributed inference and diverse accelerator support with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1nQJ8ZkLSjKxvu36sSHaceVXtttbLvvu-?usp=drive_link).
 - [2025/10] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg) focused on hands-on vLLM inference optimization! Please find the meetup slides [here](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6).
 - [2025/09] We hosted [vLLM Toronto Meetup](https://luma.com/e80e0ymm) focused on tackling inference at scale and speculative decoding with speakers from NVIDIA and Red Hat! Please find the meetup slides [here](https://docs.google.com/presentation/d/1IYJYmJcu9fLpID5N5RbW_vO0XLo0CGOR14IXOjB61V8/edit?usp=sharing).

0 commit comments
