From f0444c423df4fadad6bdc3e6ada5933b7e321327 Mon Sep 17 00:00:00 2001 From: zhewenli Date: Wed, 5 Nov 2025 11:56:28 -0800 Subject: [PATCH 1/6] update Signed-off-by: zhewenli --- .buildkite/test-amd.yaml | 2 +- pyproject.toml | 1 + tests/conftest.py | 42 +++++++++++++++---- .../openai/test_transcription_validation.py | 9 ++++ .../openai/test_translation_validation.py | 5 +++ tests/entrypoints/openai/test_vision.py | 11 +++++ .../pooling/correctness/test_mteb_score.py | 2 + .../entrypoints/pooling/llm/test_embedding.py | 2 + tests/entrypoints/pooling/llm/test_encode.py | 2 + .../pooling/openai/test_embedding.py | 2 + .../openai/test_embedding_long_text.py | 2 + .../entrypoints/pooling/openai/test_rerank.py | 2 + .../pooling/openai/test_vision_embedding.py | 2 + .../models/language/pooling/test_embedding.py | 16 +++++-- 14 files changed, 89 insertions(+), 11 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index bb5ef5d62463..f0493a1afd26 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -727,7 +727,7 @@ steps: - label: Language Models Tests (Standard) timeout_in_minutes: 25 - mirror_hardwares: [amdexperimental] + mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 # grade: Blocking torch_nightly: true diff --git a/pyproject.toml b/pyproject.toml index 29ee7f75f070..8f35054a0799 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -107,6 +107,7 @@ markers = [ "distributed: run this test only in distributed GPU tests", "skip_v1: do not run this test with v1", "optional: optional tests that are automatically skipped, include --optional to run them", + "encoder_decoder: tests that use encoder-decoder models, skipped on ROCm", ] [tool.ty.src] diff --git a/tests/conftest.py b/tests/conftest.py index 5e127e4e939e..4a79041ecdf1 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1250,13 +1250,41 @@ def pytest_addoption(parser): def pytest_collection_modifyitems(config, items): - if config.getoption("--optional"): - # --optional given in cli: do not skip optional tests - return - skip_optional = pytest.mark.skip(reason="need --optional option to run") - for item in items: - if "optional" in item.keywords: - item.add_marker(skip_optional) + if not config.getoption("--optional"): + skip_optional = pytest.mark.skip(reason="need --optional option to run") + for item in items: + if "optional" in item.keywords: + item.add_marker(skip_optional) + + from vllm.platforms import current_platform + + # Skip encoder-decoder models on ROCm: https://github.com/vllm-project/vllm/issues/27442 + ENCODER_DECODER_MODELS = [ + # Encoder-decoder models + "openai/whisper-small", + "openai/whisper-large-v3-turbo", + "mistralai/Voxtral-Mini-3B-2507", + "microsoft/Phi-3.5-vision-instruct", + # Encoder-only models (cross-encoders, embedding models) + "cross-encoder/ms-marco-MiniLM-L-6-v2", + "intfloat/multilingual-e5-small", + "BAAI/bge-reranker-base", + "BAAI/bge-base-en-v1.5", + "TIGER-Lab/VLM2Vec-Full", + "sentence-transformers/all-MiniLM-L12-v2", + "sentence-transformers/stsb-roberta-base-v2", + ] + + if current_platform.is_rocm(): + skip_encoder_decoder = pytest.mark.skip( + reason="Encoder-decoder models not supported on ROCm (all ROCm attention backends only support decoder-only models)" + ) + for item in items: + if "encoder_decoder" in item.keywords: + for encoder_model in ENCODER_DECODER_MODELS: + if encoder_model in item.nodeid: + item.add_marker(skip_encoder_decoder) + break @pytest.fixture(scope="session") diff --git 
a/tests/entrypoints/openai/test_transcription_validation.py b/tests/entrypoints/openai/test_transcription_validation.py index 88580ed899f1..dc28eae3c87f 100644 --- a/tests/entrypoints/openai/test_transcription_validation.py +++ b/tests/entrypoints/openai/test_transcription_validation.py @@ -39,6 +39,7 @@ async def client(server): @pytest.mark.asyncio +@pytest.mark.encoder_decoder @pytest.mark.parametrize( "model_name", ["openai/whisper-large-v3-turbo", "mistralai/Voxtral-Mini-3B-2507"] ) @@ -66,6 +67,7 @@ async def test_basic_audio(mary_had_lamb, model_name): @pytest.mark.asyncio +@pytest.mark.encoder_decoder async def test_basic_audio_with_lora(mary_had_lamb): """Ensure STT (transcribe) requests can pass LoRA through to generate.""" model_name = "ibm-granite/granite-speech-3.3-2b" @@ -137,6 +139,7 @@ async def test_non_asr_model(winning_call): @pytest.mark.asyncio +@pytest.mark.encoder_decoder async def test_bad_requests(mary_had_lamb, client): # invalid language with pytest.raises(openai.BadRequestError): @@ -146,6 +149,7 @@ async def test_bad_requests(mary_had_lamb, client): @pytest.mark.asyncio +@pytest.mark.encoder_decoder async def test_long_audio_request(mary_had_lamb, client): mary_had_lamb.seek(0) audio, sr = librosa.load(mary_had_lamb) @@ -172,6 +176,7 @@ async def test_long_audio_request(mary_had_lamb, client): @pytest.mark.asyncio +@pytest.mark.encoder_decoder async def test_completion_endpoints(client): # text to text model res = await client.chat.completions.create( @@ -189,6 +194,7 @@ async def test_completion_endpoints(client): @pytest.mark.asyncio +@pytest.mark.encoder_decoder async def test_streaming_response(winning_call, client): transcription = "" res_no_stream = await client.audio.transcriptions.create( @@ -215,6 +221,7 @@ async def test_streaming_response(winning_call, client): @pytest.mark.asyncio +@pytest.mark.encoder_decoder async def test_stream_options(winning_call, client): res = await client.audio.transcriptions.create( model=MODEL_NAME, @@ -237,6 +244,7 @@ async def test_stream_options(winning_call, client): @pytest.mark.asyncio +@pytest.mark.encoder_decoder async def test_sampling_params(mary_had_lamb, client): """ Compare sampling with params and greedy sampling to assert results @@ -270,6 +278,7 @@ async def test_sampling_params(mary_had_lamb, client): @pytest.mark.asyncio +@pytest.mark.encoder_decoder async def test_audio_prompt(mary_had_lamb, client): prompt = "This is a speech, recorded in a phonograph." # Prompts should not omit the part of original prompt while transcribing. diff --git a/tests/entrypoints/openai/test_translation_validation.py b/tests/entrypoints/openai/test_translation_validation.py index c060ee2b1922..16e6c142534a 100644 --- a/tests/entrypoints/openai/test_translation_validation.py +++ b/tests/entrypoints/openai/test_translation_validation.py @@ -84,6 +84,7 @@ async def test_basic_audio_with_lora(mary_had_lamb): # NOTE: (NickLucche) the large-v3-turbo model was not trained on translation! 
@pytest.mark.asyncio +@pytest.mark.encoder_decoder async def test_basic_audio(foscolo, client_and_model): client, model_name = client_and_model translation = await client.audio.translations.create( @@ -99,6 +100,7 @@ async def test_basic_audio(foscolo, client_and_model): @pytest.mark.asyncio +@pytest.mark.encoder_decoder async def test_audio_prompt(foscolo, client_and_model): client, model_name = client_and_model # Condition whisper on starting text @@ -117,6 +119,7 @@ async def test_audio_prompt(foscolo, client_and_model): @pytest.mark.asyncio +@pytest.mark.encoder_decoder async def test_streaming_response(foscolo, client_and_model, server): client, model_name = client_and_model translation = "" @@ -168,6 +171,7 @@ async def test_streaming_response(foscolo, client_and_model, server): @pytest.mark.asyncio +@pytest.mark.encoder_decoder async def test_stream_options(foscolo, server): server, model_name = server url = server.url_for("v1/audio/translations") @@ -207,6 +211,7 @@ async def test_stream_options(foscolo, server): @pytest.mark.asyncio +@pytest.mark.encoder_decoder async def test_long_audio_request(foscolo, client_and_model): client, model_name = client_and_model if model_name == "google/gemma-3n-E2B-it": diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index 2a7df08ea3b0..0022739a2621 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -122,6 +122,7 @@ def get_hf_prompt_tokens(model_name, content, image_url): @pytest.mark.asyncio +@pytest.mark.encoder_decoder @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True) async def test_single_chat_session_image( @@ -169,6 +170,7 @@ async def test_single_chat_session_image( @pytest.mark.asyncio +@pytest.mark.encoder_decoder @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True) async def test_error_on_invalid_image_url_type( @@ -196,6 +198,7 @@ async def test_error_on_invalid_image_url_type( @pytest.mark.asyncio +@pytest.mark.encoder_decoder @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True) async def test_single_chat_session_image_beamsearch( @@ -221,6 +224,7 @@ async def test_single_chat_session_image_beamsearch( @pytest.mark.asyncio +@pytest.mark.encoder_decoder @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("raw_image_url", TEST_IMAGE_ASSETS) @pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True) @@ -277,6 +281,7 @@ async def test_single_chat_session_image_base64encoded( @pytest.mark.asyncio +@pytest.mark.encoder_decoder @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("image_idx", list(range(len(TEST_IMAGE_ASSETS)))) async def test_single_chat_session_image_base64encoded_beamsearch( @@ -307,6 +312,7 @@ async def test_single_chat_session_image_base64encoded_beamsearch( @pytest.mark.asyncio +@pytest.mark.encoder_decoder @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True) async def test_chat_streaming_image( @@ -350,6 +356,7 @@ async def test_chat_streaming_image( @pytest.mark.asyncio +@pytest.mark.encoder_decoder @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize( "image_urls", @@ -391,6 +398,7 @@ async def test_multi_image_input( @pytest.mark.asyncio 
+@pytest.mark.encoder_decoder @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize( "image_urls", @@ -430,6 +438,7 @@ async def test_completions_with_image( @pytest.mark.asyncio +@pytest.mark.encoder_decoder @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize( "image_urls", @@ -493,6 +502,7 @@ async def test_completions_with_image_with_uuid( @pytest.mark.asyncio +@pytest.mark.encoder_decoder @pytest.mark.parametrize("model_name", [MODEL_NAME]) async def test_completions_with_empty_image_with_uuid_without_cache_hit( client: openai.AsyncOpenAI, @@ -522,6 +532,7 @@ async def test_completions_with_empty_image_with_uuid_without_cache_hit( @pytest.mark.asyncio +@pytest.mark.encoder_decoder @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize( "image_urls", diff --git a/tests/entrypoints/pooling/correctness/test_mteb_score.py b/tests/entrypoints/pooling/correctness/test_mteb_score.py index 1afe68b189db..50d73f080e65 100644 --- a/tests/entrypoints/pooling/correctness/test_mteb_score.py +++ b/tests/entrypoints/pooling/correctness/test_mteb_score.py @@ -28,6 +28,7 @@ def server(): yield remote_server +@pytest.mark.encoder_decoder def test_mteb_score(server): url = server.url_for("score") encoder = ScoreClientMtebEncoder(MODEL_NAME, url) @@ -42,6 +43,7 @@ def test_mteb_score(server): assert st_main_score - vllm_main_score < MTEB_RERANK_TOL +@pytest.mark.encoder_decoder def test_mteb_rerank(server): url = server.url_for("rerank") encoder = RerankClientMtebEncoder(MODEL_NAME, url) diff --git a/tests/entrypoints/pooling/llm/test_embedding.py b/tests/entrypoints/pooling/llm/test_embedding.py index 5455b5f91fc0..5c55f9d3b789 100644 --- a/tests/entrypoints/pooling/llm/test_embedding.py +++ b/tests/entrypoints/pooling/llm/test_embedding.py @@ -10,6 +10,8 @@ from vllm import LLM, PoolingParams from vllm.distributed import cleanup_dist_env_and_memory +pytestmark = pytest.mark.encoder_decoder + MODEL_NAME = "intfloat/multilingual-e5-small" prompts = ["The chef prepared a delicious meal."] diff --git a/tests/entrypoints/pooling/llm/test_encode.py b/tests/entrypoints/pooling/llm/test_encode.py index ca85d2758fce..dd98857f6025 100644 --- a/tests/entrypoints/pooling/llm/test_encode.py +++ b/tests/entrypoints/pooling/llm/test_encode.py @@ -8,6 +8,8 @@ from vllm import LLM, PoolingParams from vllm.distributed import cleanup_dist_env_and_memory +pytestmark = pytest.mark.encoder_decoder + MODEL_NAME = "intfloat/multilingual-e5-small" PROMPTS = [ diff --git a/tests/entrypoints/pooling/openai/test_embedding.py b/tests/entrypoints/pooling/openai/test_embedding.py index e971b23e8f1a..a7e4e857d2ee 100644 --- a/tests/entrypoints/pooling/openai/test_embedding.py +++ b/tests/entrypoints/pooling/openai/test_embedding.py @@ -28,6 +28,8 @@ decode_pooling_output, ) +pytestmark = pytest.mark.encoder_decoder + MODEL_NAME = "intfloat/multilingual-e5-small" DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501 DTYPE = "bfloat16" diff --git a/tests/entrypoints/pooling/openai/test_embedding_long_text.py b/tests/entrypoints/pooling/openai/test_embedding_long_text.py index f977c81a9084..4a24201af3df 100644 --- a/tests/entrypoints/pooling/openai/test_embedding_long_text.py +++ b/tests/entrypoints/pooling/openai/test_embedding_long_text.py @@ -17,6 +17,8 @@ from tests.utils import RemoteOpenAIServer from vllm.entrypoints.openai.protocol import EmbeddingResponse +pytestmark = 
pytest.mark.encoder_decoder + def _generate_random_text(word_count: int) -> str: """Generate random text with approximately the specified word count.""" diff --git a/tests/entrypoints/pooling/openai/test_rerank.py b/tests/entrypoints/pooling/openai/test_rerank.py index 1d85190c12a1..1d065c7687aa 100644 --- a/tests/entrypoints/pooling/openai/test_rerank.py +++ b/tests/entrypoints/pooling/openai/test_rerank.py @@ -9,6 +9,8 @@ from tests.utils import RemoteOpenAIServer from vllm.entrypoints.openai.protocol import PoolingResponse, RerankResponse +pytestmark = pytest.mark.encoder_decoder + MODEL_NAME = "BAAI/bge-reranker-base" DTYPE = "bfloat16" diff --git a/tests/entrypoints/pooling/openai/test_vision_embedding.py b/tests/entrypoints/pooling/openai/test_vision_embedding.py index 944392d66fa5..3d0fbc9a38a9 100644 --- a/tests/entrypoints/pooling/openai/test_vision_embedding.py +++ b/tests/entrypoints/pooling/openai/test_vision_embedding.py @@ -11,6 +11,8 @@ from vllm.entrypoints.openai.protocol import EmbeddingResponse from vllm.multimodal.utils import encode_image_base64, fetch_image +pytestmark = pytest.mark.encoder_decoder + MODEL_NAME = "TIGER-Lab/VLM2Vec-Full" MAXIMUM_IMAGES = 2 diff --git a/tests/models/language/pooling/test_embedding.py b/tests/models/language/pooling/test_embedding.py index c8deffbf66db..fd4095188c28 100644 --- a/tests/models/language/pooling/test_embedding.py +++ b/tests/models/language/pooling/test_embedding.py @@ -36,14 +36,24 @@ pytest.mark.core_model, pytest.mark.cpu_model, pytest.mark.slow_test, + pytest.mark.encoder_decoder, ], ), - pytest.param("sentence-transformers/all-MiniLM-L12-v2"), - pytest.param("intfloat/multilingual-e5-small"), + pytest.param( + "sentence-transformers/all-MiniLM-L12-v2", + marks=[pytest.mark.encoder_decoder], + ), + pytest.param( + "intfloat/multilingual-e5-small", marks=[pytest.mark.encoder_decoder] + ), # [Cross-Encoder] pytest.param( "sentence-transformers/stsb-roberta-base-v2", - marks=[pytest.mark.core_model, pytest.mark.cpu_model], + marks=[ + pytest.mark.core_model, + pytest.mark.cpu_model, + pytest.mark.encoder_decoder, + ], ), ], ) From bc57795c796cda0218d8c9139e36673fd4e430ec Mon Sep 17 00:00:00 2001 From: zhewenli Date: Thu, 6 Nov 2025 13:28:50 -0800 Subject: [PATCH 2/6] update tests Signed-off-by: zhewenli --- .buildkite/test-amd.yaml | 31 ++++++++-- tests/conftest.py | 12 ++-- .../openai/test_enable_force_include_usage.py | 7 ++- tests/entrypoints/openai/test_run_batch.py | 19 +++++++ .../openai/test_transcription_validation.py | 57 +++++++++++-------- tests/entrypoints/openai/test_vision.py | 11 ---- .../pooling/correctness/test_mteb_score.py | 2 - .../entrypoints/pooling/llm/test_embedding.py | 2 - tests/entrypoints/pooling/llm/test_encode.py | 2 - .../pooling/openai/test_embedding.py | 2 - .../openai/test_embedding_long_text.py | 2 - .../entrypoints/pooling/openai/test_rerank.py | 2 - .../pooling/openai/test_vision_embedding.py | 2 - 13 files changed, 90 insertions(+), 61 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index f0493a1afd26..cccfd73ef8a4 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -144,7 +144,7 @@ steps: - label: Entrypoints Integration Test (API Server) # 100min timeout_in_minutes: 130 - mirror_hardwares: [amdexperimental] + mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 # grade: Blocking working_dir: "/vllm-workspace/tests" @@ -157,12 +157,21 @@ steps: commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - 
PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension - - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/ + - >- + pytest -v -s entrypoints/openai + --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py + --ignore=entrypoints/openai/test_oot_registration.py + --ignore=entrypoints/openai/test_tensorizer_entrypoint.py + --ignore=entrypoints/openai/correctness/ + --ignore=entrypoints/openai/test_collective_rpc.py + --ignore=entrypoints/openai/tool_parsers/ + --ignore=entrypoints/openai/test_vision.py + --ignore=entrypoints/openai/test_optional_middleware.py - pytest -v -s entrypoints/test_chat_utils.py - label: Entrypoints Integration Test (Pooling) timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental] + mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 # grade: Blocking working_dir: "/vllm-workspace/tests" @@ -173,7 +182,21 @@ steps: - tests/entrypoints/pooling commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/pooling + # Encoder-only models not supported on ROCm (all tests use encoder/cross-encoder models) + # See .buildkite/rocm_encoder_decoder_skip.md for details + - >- + pytest -v -s entrypoints/pooling + --ignore=entrypoints/pooling/correctness/test_mteb_score.py + --ignore=entrypoints/pooling/correctness/test_mteb_embed.py + --ignore=entrypoints/pooling/llm/test_embedding.py + --ignore=entrypoints/pooling/llm/test_encode.py + --ignore=entrypoints/pooling/openai/test_embedding.py + --ignore=entrypoints/pooling/openai/test_embedding_dimensions.py + --ignore=entrypoints/pooling/openai/test_embedding_long_text.py + --ignore=entrypoints/pooling/openai/test_rerank.py + --ignore=entrypoints/pooling/openai/test_score.py + --ignore=entrypoints/pooling/openai/test_truncation.py + --ignore=entrypoints/pooling/openai/test_vision_embedding.py - label: Distributed Tests (4 GPUs) # 35min timeout_in_minutes: 50 diff --git a/tests/conftest.py b/tests/conftest.py index 4a79041ecdf1..b1bf880cc29f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1267,10 +1267,13 @@ def pytest_collection_modifyitems(config, items): "microsoft/Phi-3.5-vision-instruct", # Encoder-only models (cross-encoders, embedding models) "cross-encoder/ms-marco-MiniLM-L-6-v2", + "intfloat/e5-small", "intfloat/multilingual-e5-small", "BAAI/bge-reranker-base", + "BAAI/bge-reranker-v2-m3", "BAAI/bge-base-en-v1.5", "TIGER-Lab/VLM2Vec-Full", + "Snowflake/snowflake-arctic-embed-m-v1.5", "sentence-transformers/all-MiniLM-L12-v2", "sentence-transformers/stsb-roberta-base-v2", ] @@ -1281,10 +1284,11 @@ def pytest_collection_modifyitems(config, items): ) for item in items: if "encoder_decoder" in item.keywords: - for encoder_model in ENCODER_DECODER_MODELS: - if encoder_model in item.nodeid: - item.add_marker(skip_encoder_decoder) - break + if any( + encoder_model in item.nodeid + for encoder_model in ENCODER_DECODER_MODELS + ): + item.add_marker(skip_encoder_decoder) @pytest.fixture(scope="session") diff --git a/tests/entrypoints/openai/test_enable_force_include_usage.py b/tests/entrypoints/openai/test_enable_force_include_usage.py index 3ddf2308eb1d..27dc9a242f27 100644 --- 
a/tests/entrypoints/openai/test_enable_force_include_usage.py +++ b/tests/entrypoints/openai/test_enable_force_include_usage.py @@ -71,8 +71,8 @@ async def test_chat_with_enable_force_include_usage( assert chunk.usage is None -@pytest.fixture(scope="module") -def transcription_server_with_force_include_usage(): +@pytest.fixture(scope="module", params=["openai/whisper-large-v3-turbo"]) +def transcription_server_with_force_include_usage(request): args = [ # use half precision for speed and memory savings in CI environment "--dtype", @@ -85,7 +85,7 @@ def transcription_server_with_force_include_usage(): "0.2", ] - with RemoteOpenAIServer("openai/whisper-large-v3-turbo", args) as remote_server: + with RemoteOpenAIServer(request.param, args) as remote_server: yield remote_server @@ -100,6 +100,7 @@ async def transcription_client_with_force_include_usage( @pytest.mark.asyncio +@pytest.mark.encoder_decoder async def test_transcription_with_enable_force_include_usage( transcription_client_with_force_include_usage, winning_call ): diff --git a/tests/entrypoints/openai/test_run_batch.py b/tests/entrypoints/openai/test_run_batch.py index 2f678a0535cc..d809d35a31e1 100644 --- a/tests/entrypoints/openai/test_run_batch.py +++ b/tests/entrypoints/openai/test_run_batch.py @@ -44,6 +44,13 @@ def test_empty_file(): + from vllm.platforms import current_platform + + if current_platform.is_rocm(): + pytest.skip( + "intfloat/multilingual-e5-small is encoder-only, not supported on ROCm" + ) + with ( tempfile.NamedTemporaryFile("w") as input_file, tempfile.NamedTemporaryFile("r") as output_file, @@ -128,6 +135,13 @@ def test_completions_invalid_input(): def test_embeddings(): + from vllm.platforms import current_platform + + if current_platform.is_rocm(): + pytest.skip( + "intfloat/multilingual-e5-small is encoder-only, not supported on ROCm" + ) + with ( tempfile.NamedTemporaryFile("w") as input_file, tempfile.NamedTemporaryFile("r") as output_file, @@ -159,6 +173,11 @@ def test_embeddings(): @pytest.mark.parametrize("input_batch", [INPUT_SCORE_BATCH, INPUT_RERANK_BATCH]) def test_score(input_batch): + from vllm.platforms import current_platform + + if current_platform.is_rocm(): + pytest.skip("BAAI/bge-reranker-v2-m3 is encoder-only, not supported on ROCm") + with ( tempfile.NamedTemporaryFile("w") as input_file, tempfile.NamedTemporaryFile("r") as output_file, diff --git a/tests/entrypoints/openai/test_transcription_validation.py b/tests/entrypoints/openai/test_transcription_validation.py index dc28eae3c87f..cc5eae216a66 100644 --- a/tests/entrypoints/openai/test_transcription_validation.py +++ b/tests/entrypoints/openai/test_transcription_validation.py @@ -14,7 +14,6 @@ from ...utils import RemoteOpenAIServer -MODEL_NAME = "openai/whisper-large-v3-turbo" SERVER_ARGS = ["--enforce-eager"] MISTRAL_FORMAT_ARGS = [ "--tokenizer_mode", @@ -26,16 +25,17 @@ ] -@pytest.fixture(scope="module") -def server(): - with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as remote_server: - yield remote_server +@pytest.fixture(scope="module", params=["openai/whisper-large-v3-turbo"]) +def server(request): + with RemoteOpenAIServer(request.param, SERVER_ARGS) as remote_server: + yield remote_server, request.param @pytest_asyncio.fixture -async def client(server): +async def client_and_model(server): + server, model_name = server async with server.get_async_client() as async_client: - yield async_client + yield async_client, model_name @pytest.mark.asyncio @@ -140,17 +140,19 @@ async def test_non_asr_model(winning_call): 
@pytest.mark.asyncio @pytest.mark.encoder_decoder -async def test_bad_requests(mary_had_lamb, client): +async def test_bad_requests(mary_had_lamb, client_and_model): + client, model_name = client_and_model # invalid language with pytest.raises(openai.BadRequestError): await client.audio.transcriptions.create( - model=MODEL_NAME, file=mary_had_lamb, language="hh", temperature=0.0 + model=model_name, file=mary_had_lamb, language="hh", temperature=0.0 ) @pytest.mark.asyncio @pytest.mark.encoder_decoder -async def test_long_audio_request(mary_had_lamb, client): +async def test_long_audio_request(mary_had_lamb, client_and_model): + client, model_name = client_and_model mary_had_lamb.seek(0) audio, sr = librosa.load(mary_had_lamb) # Add small silence after each audio for repeatability in the split process @@ -161,7 +163,7 @@ async def test_long_audio_request(mary_had_lamb, client): sf.write(buffer, repeated_audio, sr, format="WAV") buffer.seek(0) transcription = await client.audio.transcriptions.create( - model=MODEL_NAME, + model=model_name, file=buffer, language="en", response_format="text", @@ -177,17 +179,18 @@ async def test_long_audio_request(mary_had_lamb, client): @pytest.mark.asyncio @pytest.mark.encoder_decoder -async def test_completion_endpoints(client): +async def test_completion_endpoints(client_and_model): + client, model_name = client_and_model # text to text model res = await client.chat.completions.create( - model=MODEL_NAME, + model=model_name, messages=[{"role": "system", "content": "You are a helpful assistant."}], ) err = res.error assert err["code"] == 400 assert err["message"] == "The model does not support Chat Completions API" - res = await client.completions.create(model=MODEL_NAME, prompt="Hello") + res = await client.completions.create(model=model_name, prompt="Hello") err = res.error assert err["code"] == 400 assert err["message"] == "The model does not support Completions API" @@ -195,17 +198,18 @@ async def test_completion_endpoints(client): @pytest.mark.asyncio @pytest.mark.encoder_decoder -async def test_streaming_response(winning_call, client): +async def test_streaming_response(winning_call, client_and_model): + client, model_name = client_and_model transcription = "" res_no_stream = await client.audio.transcriptions.create( - model=MODEL_NAME, + model=model_name, file=winning_call, response_format="json", language="en", temperature=0.0, ) res = await client.audio.transcriptions.create( - model=MODEL_NAME, + model=model_name, file=winning_call, language="en", temperature=0.0, @@ -222,9 +226,10 @@ async def test_streaming_response(winning_call, client): @pytest.mark.asyncio @pytest.mark.encoder_decoder -async def test_stream_options(winning_call, client): +async def test_stream_options(winning_call, client_and_model): + client, model_name = client_and_model res = await client.audio.transcriptions.create( - model=MODEL_NAME, + model=model_name, file=winning_call, language="en", temperature=0.0, @@ -245,13 +250,14 @@ async def test_stream_options(winning_call, client): @pytest.mark.asyncio @pytest.mark.encoder_decoder -async def test_sampling_params(mary_had_lamb, client): +async def test_sampling_params(mary_had_lamb, client_and_model): + client, model_name = client_and_model """ Compare sampling with params and greedy sampling to assert results are different when extreme sampling parameters values are picked. 
""" transcription = await client.audio.transcriptions.create( - model=MODEL_NAME, + model=model_name, file=mary_had_lamb, language="en", temperature=0.8, @@ -267,7 +273,7 @@ async def test_sampling_params(mary_had_lamb, client): ) greedy_transcription = await client.audio.transcriptions.create( - model=MODEL_NAME, + model=model_name, file=mary_had_lamb, language="en", temperature=0.0, @@ -279,12 +285,13 @@ async def test_sampling_params(mary_had_lamb, client): @pytest.mark.asyncio @pytest.mark.encoder_decoder -async def test_audio_prompt(mary_had_lamb, client): +async def test_audio_prompt(mary_had_lamb, client_and_model): + client, model_name = client_and_model prompt = "This is a speech, recorded in a phonograph." # Prompts should not omit the part of original prompt while transcribing. prefix = "The first words I spoke in the original phonograph" transcription = await client.audio.transcriptions.create( - model=MODEL_NAME, + model=model_name, file=mary_had_lamb, language="en", response_format="text", @@ -293,7 +300,7 @@ async def test_audio_prompt(mary_had_lamb, client): out = json.loads(transcription)["text"] assert prefix in out transcription_wprompt = await client.audio.transcriptions.create( - model=MODEL_NAME, + model=model_name, file=mary_had_lamb, language="en", response_format="text", diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index 0022739a2621..2a7df08ea3b0 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -122,7 +122,6 @@ def get_hf_prompt_tokens(model_name, content, image_url): @pytest.mark.asyncio -@pytest.mark.encoder_decoder @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True) async def test_single_chat_session_image( @@ -170,7 +169,6 @@ async def test_single_chat_session_image( @pytest.mark.asyncio -@pytest.mark.encoder_decoder @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True) async def test_error_on_invalid_image_url_type( @@ -198,7 +196,6 @@ async def test_error_on_invalid_image_url_type( @pytest.mark.asyncio -@pytest.mark.encoder_decoder @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True) async def test_single_chat_session_image_beamsearch( @@ -224,7 +221,6 @@ async def test_single_chat_session_image_beamsearch( @pytest.mark.asyncio -@pytest.mark.encoder_decoder @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("raw_image_url", TEST_IMAGE_ASSETS) @pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True) @@ -281,7 +277,6 @@ async def test_single_chat_session_image_base64encoded( @pytest.mark.asyncio -@pytest.mark.encoder_decoder @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("image_idx", list(range(len(TEST_IMAGE_ASSETS)))) async def test_single_chat_session_image_base64encoded_beamsearch( @@ -312,7 +307,6 @@ async def test_single_chat_session_image_base64encoded_beamsearch( @pytest.mark.asyncio -@pytest.mark.encoder_decoder @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True) async def test_chat_streaming_image( @@ -356,7 +350,6 @@ async def test_chat_streaming_image( @pytest.mark.asyncio -@pytest.mark.encoder_decoder @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize( 
"image_urls", @@ -398,7 +391,6 @@ async def test_multi_image_input( @pytest.mark.asyncio -@pytest.mark.encoder_decoder @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize( "image_urls", @@ -438,7 +430,6 @@ async def test_completions_with_image( @pytest.mark.asyncio -@pytest.mark.encoder_decoder @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize( "image_urls", @@ -502,7 +493,6 @@ async def test_completions_with_image_with_uuid( @pytest.mark.asyncio -@pytest.mark.encoder_decoder @pytest.mark.parametrize("model_name", [MODEL_NAME]) async def test_completions_with_empty_image_with_uuid_without_cache_hit( client: openai.AsyncOpenAI, @@ -532,7 +522,6 @@ async def test_completions_with_empty_image_with_uuid_without_cache_hit( @pytest.mark.asyncio -@pytest.mark.encoder_decoder @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize( "image_urls", diff --git a/tests/entrypoints/pooling/correctness/test_mteb_score.py b/tests/entrypoints/pooling/correctness/test_mteb_score.py index 50d73f080e65..1afe68b189db 100644 --- a/tests/entrypoints/pooling/correctness/test_mteb_score.py +++ b/tests/entrypoints/pooling/correctness/test_mteb_score.py @@ -28,7 +28,6 @@ def server(): yield remote_server -@pytest.mark.encoder_decoder def test_mteb_score(server): url = server.url_for("score") encoder = ScoreClientMtebEncoder(MODEL_NAME, url) @@ -43,7 +42,6 @@ def test_mteb_score(server): assert st_main_score - vllm_main_score < MTEB_RERANK_TOL -@pytest.mark.encoder_decoder def test_mteb_rerank(server): url = server.url_for("rerank") encoder = RerankClientMtebEncoder(MODEL_NAME, url) diff --git a/tests/entrypoints/pooling/llm/test_embedding.py b/tests/entrypoints/pooling/llm/test_embedding.py index 5c55f9d3b789..5455b5f91fc0 100644 --- a/tests/entrypoints/pooling/llm/test_embedding.py +++ b/tests/entrypoints/pooling/llm/test_embedding.py @@ -10,8 +10,6 @@ from vllm import LLM, PoolingParams from vllm.distributed import cleanup_dist_env_and_memory -pytestmark = pytest.mark.encoder_decoder - MODEL_NAME = "intfloat/multilingual-e5-small" prompts = ["The chef prepared a delicious meal."] diff --git a/tests/entrypoints/pooling/llm/test_encode.py b/tests/entrypoints/pooling/llm/test_encode.py index dd98857f6025..ca85d2758fce 100644 --- a/tests/entrypoints/pooling/llm/test_encode.py +++ b/tests/entrypoints/pooling/llm/test_encode.py @@ -8,8 +8,6 @@ from vllm import LLM, PoolingParams from vllm.distributed import cleanup_dist_env_and_memory -pytestmark = pytest.mark.encoder_decoder - MODEL_NAME = "intfloat/multilingual-e5-small" PROMPTS = [ diff --git a/tests/entrypoints/pooling/openai/test_embedding.py b/tests/entrypoints/pooling/openai/test_embedding.py index a7e4e857d2ee..e971b23e8f1a 100644 --- a/tests/entrypoints/pooling/openai/test_embedding.py +++ b/tests/entrypoints/pooling/openai/test_embedding.py @@ -28,8 +28,6 @@ decode_pooling_output, ) -pytestmark = pytest.mark.encoder_decoder - MODEL_NAME = "intfloat/multilingual-e5-small" DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501 DTYPE = "bfloat16" diff --git a/tests/entrypoints/pooling/openai/test_embedding_long_text.py b/tests/entrypoints/pooling/openai/test_embedding_long_text.py index 4a24201af3df..f977c81a9084 100644 --- a/tests/entrypoints/pooling/openai/test_embedding_long_text.py +++ b/tests/entrypoints/pooling/openai/test_embedding_long_text.py @@ -17,8 +17,6 @@ from tests.utils import 
RemoteOpenAIServer from vllm.entrypoints.openai.protocol import EmbeddingResponse -pytestmark = pytest.mark.encoder_decoder - def _generate_random_text(word_count: int) -> str: """Generate random text with approximately the specified word count.""" diff --git a/tests/entrypoints/pooling/openai/test_rerank.py b/tests/entrypoints/pooling/openai/test_rerank.py index 1d065c7687aa..1d85190c12a1 100644 --- a/tests/entrypoints/pooling/openai/test_rerank.py +++ b/tests/entrypoints/pooling/openai/test_rerank.py @@ -9,8 +9,6 @@ from tests.utils import RemoteOpenAIServer from vllm.entrypoints.openai.protocol import PoolingResponse, RerankResponse -pytestmark = pytest.mark.encoder_decoder - MODEL_NAME = "BAAI/bge-reranker-base" DTYPE = "bfloat16" diff --git a/tests/entrypoints/pooling/openai/test_vision_embedding.py b/tests/entrypoints/pooling/openai/test_vision_embedding.py index 3d0fbc9a38a9..944392d66fa5 100644 --- a/tests/entrypoints/pooling/openai/test_vision_embedding.py +++ b/tests/entrypoints/pooling/openai/test_vision_embedding.py @@ -11,8 +11,6 @@ from vllm.entrypoints.openai.protocol import EmbeddingResponse from vllm.multimodal.utils import encode_image_base64, fetch_image -pytestmark = pytest.mark.encoder_decoder - MODEL_NAME = "TIGER-Lab/VLM2Vec-Full" MAXIMUM_IMAGES = 2 From adce9198c8cbc48ccc077fe00256b1b741ac9160 Mon Sep 17 00:00:00 2001 From: zhewenli Date: Thu, 6 Nov 2025 13:31:23 -0800 Subject: [PATCH 3/6] update tests Signed-off-by: zhewenli --- .buildkite/test-amd.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index cccfd73ef8a4..f7826187072b 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -165,6 +165,7 @@ steps: --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/ + # Skip encoder models on ROCm: https://github.com/vllm-project/vllm/issues/27442 --ignore=entrypoints/openai/test_vision.py --ignore=entrypoints/openai/test_optional_middleware.py - pytest -v -s entrypoints/test_chat_utils.py @@ -182,8 +183,7 @@ steps: - tests/entrypoints/pooling commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - # Encoder-only models not supported on ROCm (all tests use encoder/cross-encoder models) - # See .buildkite/rocm_encoder_decoder_skip.md for details + # Skip encoder models on ROCm: https://github.com/vllm-project/vllm/issues/27442 - >- pytest -v -s entrypoints/pooling --ignore=entrypoints/pooling/correctness/test_mteb_score.py From ffc3b430a3a01c6fbfd6bd27d86e3498cb44e6a6 Mon Sep 17 00:00:00 2001 From: zhewenli Date: Thu, 6 Nov 2025 14:02:31 -0800 Subject: [PATCH 4/6] update tests Signed-off-by: zhewenli --- .buildkite/test-amd.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index f7826187072b..9000e0b9acb9 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -165,9 +165,9 @@ steps: --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/ - # Skip encoder models on ROCm: https://github.com/vllm-project/vllm/issues/27442 --ignore=entrypoints/openai/test_vision.py --ignore=entrypoints/openai/test_optional_middleware.py + # ^ Skip encoder models (test_vision.py, test_optional_middleware.py) on ROCm: https://github.com/vllm-project/vllm/issues/27442 - pytest -v -s entrypoints/test_chat_utils.py - label: Entrypoints Integration Test (Pooling) 
From bfadf5c4e97fb4eed26a9779f4b938c9ba80598c Mon Sep 17 00:00:00 2001 From: zhewenli Date: Thu, 6 Nov 2025 17:40:27 -0800 Subject: [PATCH 5/6] update tests Signed-off-by: zhewenli --- .buildkite/test-amd.yaml | 4 +--- tests/conftest.py | 3 --- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 9000e0b9acb9..f098b4f74a4e 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -165,9 +165,8 @@ steps: --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/ - --ignore=entrypoints/openai/test_vision.py --ignore=entrypoints/openai/test_optional_middleware.py - # ^ Skip encoder models (test_vision.py, test_optional_middleware.py) on ROCm: https://github.com/vllm-project/vllm/issues/27442 + # Skip test_optional_middleware.py since encoder models are supported on ROCm: https://github.com/vllm-project/vllm/issues/27442 - pytest -v -s entrypoints/test_chat_utils.py - label: Entrypoints Integration Test (Pooling) @@ -196,7 +195,6 @@ steps: --ignore=entrypoints/pooling/openai/test_rerank.py --ignore=entrypoints/pooling/openai/test_score.py --ignore=entrypoints/pooling/openai/test_truncation.py - --ignore=entrypoints/pooling/openai/test_vision_embedding.py - label: Distributed Tests (4 GPUs) # 35min timeout_in_minutes: 50 diff --git a/tests/conftest.py b/tests/conftest.py index b1bf880cc29f..637a96e62c0c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1263,8 +1263,6 @@ def pytest_collection_modifyitems(config, items): # Encoder-decoder models "openai/whisper-small", "openai/whisper-large-v3-turbo", - "mistralai/Voxtral-Mini-3B-2507", - "microsoft/Phi-3.5-vision-instruct", # Encoder-only models (cross-encoders, embedding models) "cross-encoder/ms-marco-MiniLM-L-6-v2", "intfloat/e5-small", @@ -1272,7 +1270,6 @@ def pytest_collection_modifyitems(config, items): "BAAI/bge-reranker-base", "BAAI/bge-reranker-v2-m3", "BAAI/bge-base-en-v1.5", - "TIGER-Lab/VLM2Vec-Full", "Snowflake/snowflake-arctic-embed-m-v1.5", "sentence-transformers/all-MiniLM-L12-v2", "sentence-transformers/stsb-roberta-base-v2", From 616ae433f70037b66f96eaec6034cb5ff6d2ee8d Mon Sep 17 00:00:00 2001 From: zhewenli Date: Thu, 6 Nov 2025 19:48:26 -0800 Subject: [PATCH 6/6] update tests Signed-off-by: zhewenli --- .buildkite/test-amd.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index f098b4f74a4e..93d966f83be4 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -169,8 +169,8 @@ steps: # Skip test_optional_middleware.py since encoder models are supported on ROCm: https://github.com/vllm-project/vllm/issues/27442 - pytest -v -s entrypoints/test_chat_utils.py -- label: Entrypoints Integration Test (Pooling) - timeout_in_minutes: 50 +- label: Entrypoints Integration Test (Pooling) # 10min + timeout_in_minutes: 15 mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 # grade: Blocking @@ -748,7 +748,7 @@ steps: - label: Language Models Tests (Standard) timeout_in_minutes: 25 - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental] agent_pool: mi325_1 # grade: Blocking torch_nightly: true