33 changes: 27 additions & 6 deletions .buildkite/test-amd.yaml
@@ -144,7 +144,7 @@ steps:

- label: Entrypoints Integration Test (API Server) # 100min
timeout_in_minutes: 130
mirror_hardwares: [amdexperimental]
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
working_dir: "/vllm-workspace/tests"
@@ -157,12 +157,21 @@
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
- >-
pytest -v -s entrypoints/openai
--ignore=entrypoints/openai/test_chat_with_tool_reasoning.py
--ignore=entrypoints/openai/test_oot_registration.py
--ignore=entrypoints/openai/test_tensorizer_entrypoint.py
--ignore=entrypoints/openai/correctness/
--ignore=entrypoints/openai/test_collective_rpc.py
--ignore=entrypoints/openai/tool_parsers/
--ignore=entrypoints/openai/test_optional_middleware.py
# Skip test_optional_middleware.py since encoder models are not supported on ROCm: https://github.com/vllm-project/vllm/issues/27442
- pytest -v -s entrypoints/test_chat_utils.py

- label: Entrypoints Integration Test (Pooling)
timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
- label: Entrypoints Integration Test (Pooling) # 10min
timeout_in_minutes: 15
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
working_dir: "/vllm-workspace/tests"
@@ -173,7 +182,19 @@
- tests/entrypoints/pooling
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/pooling
# Skip encoder models on ROCm: https://github.com/vllm-project/vllm/issues/27442
- >-
pytest -v -s entrypoints/pooling
--ignore=entrypoints/pooling/correctness/test_mteb_score.py
--ignore=entrypoints/pooling/correctness/test_mteb_embed.py
--ignore=entrypoints/pooling/llm/test_embedding.py
--ignore=entrypoints/pooling/llm/test_encode.py
--ignore=entrypoints/pooling/openai/test_embedding.py
--ignore=entrypoints/pooling/openai/test_embedding_dimensions.py
--ignore=entrypoints/pooling/openai/test_embedding_long_text.py
--ignore=entrypoints/pooling/openai/test_rerank.py
--ignore=entrypoints/pooling/openai/test_score.py
--ignore=entrypoints/pooling/openai/test_truncation.py

- label: Distributed Tests (4 GPUs) # 35min
timeout_in_minutes: 50
1 change: 1 addition & 0 deletions pyproject.toml
@@ -107,6 +107,7 @@ markers = [
"distributed: run this test only in distributed GPU tests",
"skip_v1: do not run this test with v1",
"optional: optional tests that are automatically skipped, include --optional to run them",
"encoder_decoder: tests that use encoder-decoder models, skipped on ROCm",
]

[tool.ty.src]
43 changes: 36 additions & 7 deletions tests/conftest.py
@@ -1250,13 +1250,42 @@ def pytest_addoption(parser):


def pytest_collection_modifyitems(config, items):
if config.getoption("--optional"):
# --optional given in cli: do not skip optional tests
return
skip_optional = pytest.mark.skip(reason="need --optional option to run")
for item in items:
if "optional" in item.keywords:
item.add_marker(skip_optional)
if not config.getoption("--optional"):
skip_optional = pytest.mark.skip(reason="need --optional option to run")
for item in items:
if "optional" in item.keywords:
item.add_marker(skip_optional)

from vllm.platforms import current_platform

# Skip encoder-decoder models on ROCm: https://github.com/vllm-project/vllm/issues/27442
ENCODER_DECODER_MODELS = [
# Encoder-decoder models
"openai/whisper-small",
"openai/whisper-large-v3-turbo",
# Encoder-only models (cross-encoders, embedding models)
"cross-encoder/ms-marco-MiniLM-L-6-v2",
"intfloat/e5-small",
"intfloat/multilingual-e5-small",
"BAAI/bge-reranker-base",
"BAAI/bge-reranker-v2-m3",
"BAAI/bge-base-en-v1.5",
"Snowflake/snowflake-arctic-embed-m-v1.5",
"sentence-transformers/all-MiniLM-L12-v2",
"sentence-transformers/stsb-roberta-base-v2",
]

if current_platform.is_rocm():
skip_encoder_decoder = pytest.mark.skip(
reason="Encoder-decoder models not supported on ROCm (all ROCm attention backends only support decoder-only models)"
)
for item in items:
if "encoder_decoder" in item.keywords:
if any(
encoder_model in item.nodeid
for encoder_model in ENCODER_DECODER_MODELS
):
item.add_marker(skip_encoder_decoder)


@pytest.fixture(scope="session")
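Note on the hook above: a test is only skipped on ROCm when it both carries the `encoder_decoder` marker and has one of the listed model names in its node ID (which is where parametrize and fixture params end up). Marked tests can also be deselected manually with `pytest -m "not encoder_decoder"`. The sketch below is illustrative, not part of this diff; it mirrors how `test_basic_audio` is marked and parametrized in `tests/entrypoints/openai/test_transcription_validation.py`.

```python
import pytest


@pytest.mark.encoder_decoder
@pytest.mark.parametrize(
    "model_name",
    ["openai/whisper-large-v3-turbo", "mistralai/Voxtral-Mini-3B-2507"],
)
def test_example(model_name):
    # On ROCm, the Whisper case is skipped because its node ID,
    # e.g. "test_example[openai/whisper-large-v3-turbo]", contains a model
    # from ENCODER_DECODER_MODELS; the Voxtral case still runs.
    assert isinstance(model_name, str)
```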
7 changes: 4 additions & 3 deletions tests/entrypoints/openai/test_enable_force_include_usage.py
@@ -71,8 +71,8 @@ async def test_chat_with_enable_force_include_usage(
assert chunk.usage is None


@pytest.fixture(scope="module")
def transcription_server_with_force_include_usage():
@pytest.fixture(scope="module", params=["openai/whisper-large-v3-turbo"])
def transcription_server_with_force_include_usage(request):
args = [
# use half precision for speed and memory savings in CI environment
"--dtype",
@@ -85,7 +85,7 @@ def transcription_server_with_force_include_usage():
"0.2",
]

with RemoteOpenAIServer("openai/whisper-large-v3-turbo", args) as remote_server:
with RemoteOpenAIServer(request.param, args) as remote_server:
yield remote_server


Expand All @@ -100,6 +100,7 @@ async def transcription_client_with_force_include_usage(


@pytest.mark.asyncio
@pytest.mark.encoder_decoder
async def test_transcription_with_enable_force_include_usage(
transcription_client_with_force_include_usage, winning_call
):
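A plausible reading of the fixture change above: turning the hardcoded Whisper model into a fixture param puts the model name into each consuming test's node ID, which is what the ROCm skip hook in `tests/conftest.py` matches against. A minimal sketch of the pattern (names are illustrative, not from this diff):

```python
import pytest


@pytest.fixture(scope="module", params=["openai/whisper-large-v3-turbo"])
def model_name(request):
    # The param value becomes part of every consuming test's node ID,
    # e.g. "test_uses_model[openai/whisper-large-v3-turbo]".
    return request.param


@pytest.mark.encoder_decoder
def test_uses_model(model_name):
    # Skipped on ROCm by the collection hook; runs elsewhere.
    assert model_name.startswith("openai/")
```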
19 changes: 19 additions & 0 deletions tests/entrypoints/openai/test_run_batch.py
@@ -44,6 +44,13 @@


def test_empty_file():
from vllm.platforms import current_platform

if current_platform.is_rocm():
pytest.skip(
"intfloat/multilingual-e5-small is encoder-only, not supported on ROCm"
)

with (
tempfile.NamedTemporaryFile("w") as input_file,
tempfile.NamedTemporaryFile("r") as output_file,
@@ -128,6 +135,13 @@ def test_completions_invalid_input():


def test_embeddings():
from vllm.platforms import current_platform

if current_platform.is_rocm():
pytest.skip(
"intfloat/multilingual-e5-small is encoder-only, not supported on ROCm"
)

with (
tempfile.NamedTemporaryFile("w") as input_file,
tempfile.NamedTemporaryFile("r") as output_file,
@@ -159,6 +173,11 @@ def test_embeddings():

@pytest.mark.parametrize("input_batch", [INPUT_SCORE_BATCH, INPUT_RERANK_BATCH])
def test_score(input_batch):
from vllm.platforms import current_platform

if current_platform.is_rocm():
pytest.skip("BAAI/bge-reranker-v2-m3 is encoder-only, not supported on ROCm")

with (
tempfile.NamedTemporaryFile("w") as input_file,
tempfile.NamedTemporaryFile("r") as output_file,
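The run-batch tests above use runtime `pytest.skip(...)` calls guarded by `current_platform.is_rocm()`. A declarative alternative (a sketch only, not what this diff does) would be a shared `skipif` marker:

```python
import pytest

from vllm.platforms import current_platform

# Sketch of an equivalent declarative skip; not part of this PR.
skip_on_rocm = pytest.mark.skipif(
    current_platform.is_rocm(),
    reason="encoder-only models are not supported on ROCm",
)


@skip_on_rocm
def test_embeddings_smoke():
    ...
```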
66 changes: 41 additions & 25 deletions tests/entrypoints/openai/test_transcription_validation.py
@@ -14,7 +14,6 @@

from ...utils import RemoteOpenAIServer

MODEL_NAME = "openai/whisper-large-v3-turbo"
SERVER_ARGS = ["--enforce-eager"]
MISTRAL_FORMAT_ARGS = [
"--tokenizer_mode",
@@ -26,19 +25,21 @@
]


@pytest.fixture(scope="module")
def server():
with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as remote_server:
yield remote_server
@pytest.fixture(scope="module", params=["openai/whisper-large-v3-turbo"])
def server(request):
with RemoteOpenAIServer(request.param, SERVER_ARGS) as remote_server:
yield remote_server, request.param


@pytest_asyncio.fixture
async def client(server):
async def client_and_model(server):
server, model_name = server
async with server.get_async_client() as async_client:
yield async_client
yield async_client, model_name


@pytest.mark.asyncio
@pytest.mark.encoder_decoder
@pytest.mark.parametrize(
"model_name", ["openai/whisper-large-v3-turbo", "mistralai/Voxtral-Mini-3B-2507"]
)
@@ -66,6 +67,7 @@ async def test_basic_audio(mary_had_lamb, model_name):


@pytest.mark.asyncio
@pytest.mark.encoder_decoder
async def test_basic_audio_with_lora(mary_had_lamb):
"""Ensure STT (transcribe) requests can pass LoRA through to generate."""
model_name = "ibm-granite/granite-speech-3.3-2b"
@@ -137,16 +139,20 @@ async def test_non_asr_model(winning_call):


@pytest.mark.asyncio
async def test_bad_requests(mary_had_lamb, client):
@pytest.mark.encoder_decoder
async def test_bad_requests(mary_had_lamb, client_and_model):
client, model_name = client_and_model
# invalid language
with pytest.raises(openai.BadRequestError):
await client.audio.transcriptions.create(
model=MODEL_NAME, file=mary_had_lamb, language="hh", temperature=0.0
model=model_name, file=mary_had_lamb, language="hh", temperature=0.0
)


@pytest.mark.asyncio
async def test_long_audio_request(mary_had_lamb, client):
@pytest.mark.encoder_decoder
async def test_long_audio_request(mary_had_lamb, client_and_model):
client, model_name = client_and_model
mary_had_lamb.seek(0)
audio, sr = librosa.load(mary_had_lamb)
# Add small silence after each audio for repeatability in the split process
@@ -157,7 +163,7 @@ async def test_long_audio_request(mary_had_lamb, client):
sf.write(buffer, repeated_audio, sr, format="WAV")
buffer.seek(0)
transcription = await client.audio.transcriptions.create(
model=MODEL_NAME,
model=model_name,
file=buffer,
language="en",
response_format="text",
@@ -172,34 +178,38 @@


@pytest.mark.asyncio
async def test_completion_endpoints(client):
@pytest.mark.encoder_decoder
async def test_completion_endpoints(client_and_model):
client, model_name = client_and_model
# text to text model
res = await client.chat.completions.create(
model=MODEL_NAME,
model=model_name,
messages=[{"role": "system", "content": "You are a helpful assistant."}],
)
err = res.error
assert err["code"] == 400
assert err["message"] == "The model does not support Chat Completions API"

res = await client.completions.create(model=MODEL_NAME, prompt="Hello")
res = await client.completions.create(model=model_name, prompt="Hello")
err = res.error
assert err["code"] == 400
assert err["message"] == "The model does not support Completions API"


@pytest.mark.asyncio
async def test_streaming_response(winning_call, client):
@pytest.mark.encoder_decoder
async def test_streaming_response(winning_call, client_and_model):
client, model_name = client_and_model
transcription = ""
res_no_stream = await client.audio.transcriptions.create(
model=MODEL_NAME,
model=model_name,
file=winning_call,
response_format="json",
language="en",
temperature=0.0,
)
res = await client.audio.transcriptions.create(
model=MODEL_NAME,
model=model_name,
file=winning_call,
language="en",
temperature=0.0,
@@ -215,9 +225,11 @@


@pytest.mark.asyncio
async def test_stream_options(winning_call, client):
@pytest.mark.encoder_decoder
async def test_stream_options(winning_call, client_and_model):
client, model_name = client_and_model
res = await client.audio.transcriptions.create(
model=MODEL_NAME,
model=model_name,
file=winning_call,
language="en",
temperature=0.0,
@@ -237,13 +249,15 @@ async def test_sampling_params(mary_had_lamb, client):


@pytest.mark.asyncio
async def test_sampling_params(mary_had_lamb, client):
@pytest.mark.encoder_decoder
async def test_sampling_params(mary_had_lamb, client_and_model):
"""
Compare sampling with params and greedy sampling to assert results
are different when extreme sampling parameter values are picked.
"""
client, model_name = client_and_model
transcription = await client.audio.transcriptions.create(
model=MODEL_NAME,
model=model_name,
file=mary_had_lamb,
language="en",
temperature=0.8,
Expand All @@ -259,7 +273,7 @@ async def test_sampling_params(mary_had_lamb, client):
)

greedy_transcription = await client.audio.transcriptions.create(
model=MODEL_NAME,
model=model_name,
file=mary_had_lamb,
language="en",
temperature=0.0,
@@ -270,12 +284,14 @@ async def test_audio_prompt(mary_had_lamb, client):


@pytest.mark.asyncio
async def test_audio_prompt(mary_had_lamb, client):
@pytest.mark.encoder_decoder
async def test_audio_prompt(mary_had_lamb, client_and_model):
client, model_name = client_and_model
prompt = "This is a speech, recorded in a phonograph."
# Prompts should not omit the part of original prompt while transcribing.
prefix = "The first words I spoke in the original phonograph"
transcription = await client.audio.transcriptions.create(
model=MODEL_NAME,
model=model_name,
file=mary_had_lamb,
language="en",
response_format="text",
Expand All @@ -284,7 +300,7 @@ async def test_audio_prompt(mary_had_lamb, client):
out = json.loads(transcription)["text"]
assert prefix in out
transcription_wprompt = await client.audio.transcriptions.create(
model=MODEL_NAME,
model=model_name,
file=mary_had_lamb,
language="en",
response_format="text",