diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index bb5ef5d62463..93d966f83be4 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -144,7 +144,7 @@ steps:
 
 - label: Entrypoints Integration Test (API Server) # 100min
   timeout_in_minutes: 130
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking
   working_dir: "/vllm-workspace/tests"
@@ -157,12 +157,21 @@ steps:
   commands:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
-  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
+  - >-
+    pytest -v -s entrypoints/openai
+    --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py
+    --ignore=entrypoints/openai/test_oot_registration.py
+    --ignore=entrypoints/openai/test_tensorizer_entrypoint.py
+    --ignore=entrypoints/openai/correctness/
+    --ignore=entrypoints/openai/test_collective_rpc.py
+    --ignore=entrypoints/openai/tool_parsers/
+    --ignore=entrypoints/openai/test_optional_middleware.py
+  # Skip test_optional_middleware.py since encoder models are not supported on ROCm: https://github.com/vllm-project/vllm/issues/27442
   - pytest -v -s entrypoints/test_chat_utils.py
 
-- label: Entrypoints Integration Test (Pooling)
-  timeout_in_minutes: 50
-  mirror_hardwares: [amdexperimental]
+- label: Entrypoints Integration Test (Pooling) # 10min
+  timeout_in_minutes: 15
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking
   working_dir: "/vllm-workspace/tests"
@@ -173,7 +182,19 @@ steps:
   - tests/entrypoints/pooling
   commands:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -v -s entrypoints/pooling
+  # Skip encoder models on ROCm: https://github.com/vllm-project/vllm/issues/27442
+  - >-
+    pytest -v -s entrypoints/pooling
+    --ignore=entrypoints/pooling/correctness/test_mteb_score.py
+    --ignore=entrypoints/pooling/correctness/test_mteb_embed.py
+    --ignore=entrypoints/pooling/llm/test_embedding.py
+    --ignore=entrypoints/pooling/llm/test_encode.py
+    --ignore=entrypoints/pooling/openai/test_embedding.py
+    --ignore=entrypoints/pooling/openai/test_embedding_dimensions.py
+    --ignore=entrypoints/pooling/openai/test_embedding_long_text.py
+    --ignore=entrypoints/pooling/openai/test_rerank.py
+    --ignore=entrypoints/pooling/openai/test_score.py
+    --ignore=entrypoints/pooling/openai/test_truncation.py
 
 - label: Distributed Tests (4 GPUs) # 35min
   timeout_in_minutes: 50
diff --git a/pyproject.toml b/pyproject.toml
index 29ee7f75f070..8f35054a0799 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -107,6 +107,7 @@ markers = [
     "distributed: run this test only in distributed GPU tests",
     "skip_v1: do not run this test with v1",
     "optional: optional tests that are automatically skipped, include --optional to run them",
+    "encoder_decoder: tests that use encoder-decoder models, skipped on ROCm",
 ]
 
 [tool.ty.src]
diff --git a/tests/conftest.py b/tests/conftest.py
index 5e127e4e939e..637a96e62c0c 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1250,13 +1250,42 @@ def pytest_addoption(parser):
 
 
 def pytest_collection_modifyitems(config, items):
-    if config.getoption("--optional"):
-        # --optional given in cli: do not skip optional tests
-        return
-    skip_optional = pytest.mark.skip(reason="need --optional option to run")
-    for item in items:
-        if "optional" in item.keywords:
-            item.add_marker(skip_optional)
+    if not config.getoption("--optional"):
+        skip_optional = pytest.mark.skip(reason="need --optional option to run")
+        for item in items:
+            if "optional" in item.keywords:
+                item.add_marker(skip_optional)
+
+    from vllm.platforms import current_platform
+
+    # Skip encoder-decoder models on ROCm: https://github.com/vllm-project/vllm/issues/27442
+    ENCODER_DECODER_MODELS = [
+        # Encoder-decoder models
+        "openai/whisper-small",
+        "openai/whisper-large-v3-turbo",
+        # Encoder-only models (cross-encoders, embedding models)
+        "cross-encoder/ms-marco-MiniLM-L-6-v2",
+        "intfloat/e5-small",
+        "intfloat/multilingual-e5-small",
+        "BAAI/bge-reranker-base",
+        "BAAI/bge-reranker-v2-m3",
+        "BAAI/bge-base-en-v1.5",
+        "Snowflake/snowflake-arctic-embed-m-v1.5",
+        "sentence-transformers/all-MiniLM-L12-v2",
+        "sentence-transformers/stsb-roberta-base-v2",
+    ]
+
+    if current_platform.is_rocm():
+        skip_encoder_decoder = pytest.mark.skip(
+            reason="Encoder-decoder models not supported on ROCm (all ROCm attention backends only support decoder-only models)"
+        )
+        for item in items:
+            if "encoder_decoder" in item.keywords:
+                if any(
+                    encoder_model in item.nodeid
+                    for encoder_model in ENCODER_DECODER_MODELS
+                ):
+                    item.add_marker(skip_encoder_decoder)
 
 
 @pytest.fixture(scope="session")
diff --git a/tests/entrypoints/openai/test_enable_force_include_usage.py b/tests/entrypoints/openai/test_enable_force_include_usage.py
index 3ddf2308eb1d..27dc9a242f27 100644
--- a/tests/entrypoints/openai/test_enable_force_include_usage.py
+++ b/tests/entrypoints/openai/test_enable_force_include_usage.py
@@ -71,8 +71,8 @@ async def test_chat_with_enable_force_include_usage(
         assert chunk.usage is None
 
 
-@pytest.fixture(scope="module")
-def transcription_server_with_force_include_usage():
+@pytest.fixture(scope="module", params=["openai/whisper-large-v3-turbo"])
+def transcription_server_with_force_include_usage(request):
     args = [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
@@ -85,7 +85,7 @@ def transcription_server_with_force_include_usage():
         "0.2",
     ]
 
-    with RemoteOpenAIServer("openai/whisper-large-v3-turbo", args) as remote_server:
+    with RemoteOpenAIServer(request.param, args) as remote_server:
         yield remote_server
 
 
@@ -100,6 +100,7 @@ async def transcription_client_with_force_include_usage(
 
 
 @pytest.mark.asyncio
+@pytest.mark.encoder_decoder
 async def test_transcription_with_enable_force_include_usage(
     transcription_client_with_force_include_usage, winning_call
 ):
diff --git a/tests/entrypoints/openai/test_run_batch.py b/tests/entrypoints/openai/test_run_batch.py
index 2f678a0535cc..d809d35a31e1 100644
--- a/tests/entrypoints/openai/test_run_batch.py
+++ b/tests/entrypoints/openai/test_run_batch.py
@@ -44,6 +44,13 @@
 
 
 def test_empty_file():
+    from vllm.platforms import current_platform
+
+    if current_platform.is_rocm():
+        pytest.skip(
+            "intfloat/multilingual-e5-small is encoder-only, not supported on ROCm"
+        )
+
     with (
         tempfile.NamedTemporaryFile("w") as input_file,
         tempfile.NamedTemporaryFile("r") as output_file,
@@ -128,6 +135,13 @@ def test_completions_invalid_input():
 
 
 def test_embeddings():
+    from vllm.platforms import current_platform
+
+    if current_platform.is_rocm():
+        pytest.skip(
+            "intfloat/multilingual-e5-small is encoder-only, not supported on ROCm"
supported on ROCm" + ) + with ( tempfile.NamedTemporaryFile("w") as input_file, tempfile.NamedTemporaryFile("r") as output_file, @@ -159,6 +173,11 @@ def test_embeddings(): @pytest.mark.parametrize("input_batch", [INPUT_SCORE_BATCH, INPUT_RERANK_BATCH]) def test_score(input_batch): + from vllm.platforms import current_platform + + if current_platform.is_rocm(): + pytest.skip("BAAI/bge-reranker-v2-m3 is encoder-only, not supported on ROCm") + with ( tempfile.NamedTemporaryFile("w") as input_file, tempfile.NamedTemporaryFile("r") as output_file, diff --git a/tests/entrypoints/openai/test_transcription_validation.py b/tests/entrypoints/openai/test_transcription_validation.py index 88580ed899f1..cc5eae216a66 100644 --- a/tests/entrypoints/openai/test_transcription_validation.py +++ b/tests/entrypoints/openai/test_transcription_validation.py @@ -14,7 +14,6 @@ from ...utils import RemoteOpenAIServer -MODEL_NAME = "openai/whisper-large-v3-turbo" SERVER_ARGS = ["--enforce-eager"] MISTRAL_FORMAT_ARGS = [ "--tokenizer_mode", @@ -26,19 +25,21 @@ ] -@pytest.fixture(scope="module") -def server(): - with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as remote_server: - yield remote_server +@pytest.fixture(scope="module", params=["openai/whisper-large-v3-turbo"]) +def server(request): + with RemoteOpenAIServer(request.param, SERVER_ARGS) as remote_server: + yield remote_server, request.param @pytest_asyncio.fixture -async def client(server): +async def client_and_model(server): + server, model_name = server async with server.get_async_client() as async_client: - yield async_client + yield async_client, model_name @pytest.mark.asyncio +@pytest.mark.encoder_decoder @pytest.mark.parametrize( "model_name", ["openai/whisper-large-v3-turbo", "mistralai/Voxtral-Mini-3B-2507"] ) @@ -66,6 +67,7 @@ async def test_basic_audio(mary_had_lamb, model_name): @pytest.mark.asyncio +@pytest.mark.encoder_decoder async def test_basic_audio_with_lora(mary_had_lamb): """Ensure STT (transcribe) requests can pass LoRA through to generate.""" model_name = "ibm-granite/granite-speech-3.3-2b" @@ -137,16 +139,20 @@ async def test_non_asr_model(winning_call): @pytest.mark.asyncio -async def test_bad_requests(mary_had_lamb, client): +@pytest.mark.encoder_decoder +async def test_bad_requests(mary_had_lamb, client_and_model): + client, model_name = client_and_model # invalid language with pytest.raises(openai.BadRequestError): await client.audio.transcriptions.create( - model=MODEL_NAME, file=mary_had_lamb, language="hh", temperature=0.0 + model=model_name, file=mary_had_lamb, language="hh", temperature=0.0 ) @pytest.mark.asyncio -async def test_long_audio_request(mary_had_lamb, client): +@pytest.mark.encoder_decoder +async def test_long_audio_request(mary_had_lamb, client_and_model): + client, model_name = client_and_model mary_had_lamb.seek(0) audio, sr = librosa.load(mary_had_lamb) # Add small silence after each audio for repeatability in the split process @@ -157,7 +163,7 @@ async def test_long_audio_request(mary_had_lamb, client): sf.write(buffer, repeated_audio, sr, format="WAV") buffer.seek(0) transcription = await client.audio.transcriptions.create( - model=MODEL_NAME, + model=model_name, file=buffer, language="en", response_format="text", @@ -172,34 +178,38 @@ async def test_long_audio_request(mary_had_lamb, client): @pytest.mark.asyncio -async def test_completion_endpoints(client): +@pytest.mark.encoder_decoder +async def test_completion_endpoints(client_and_model): + client, model_name = client_and_model # text to text model 
     res = await client.chat.completions.create(
-        model=MODEL_NAME,
+        model=model_name,
         messages=[{"role": "system", "content": "You are a helpful assistant."}],
     )
     err = res.error
     assert err["code"] == 400
     assert err["message"] == "The model does not support Chat Completions API"
 
-    res = await client.completions.create(model=MODEL_NAME, prompt="Hello")
+    res = await client.completions.create(model=model_name, prompt="Hello")
     err = res.error
     assert err["code"] == 400
     assert err["message"] == "The model does not support Completions API"
 
 
 @pytest.mark.asyncio
-async def test_streaming_response(winning_call, client):
+@pytest.mark.encoder_decoder
+async def test_streaming_response(winning_call, client_and_model):
+    client, model_name = client_and_model
     transcription = ""
     res_no_stream = await client.audio.transcriptions.create(
-        model=MODEL_NAME,
+        model=model_name,
         file=winning_call,
         response_format="json",
         language="en",
         temperature=0.0,
     )
     res = await client.audio.transcriptions.create(
-        model=MODEL_NAME,
+        model=model_name,
         file=winning_call,
         language="en",
         temperature=0.0,
@@ -215,9 +225,11 @@
 
 
 @pytest.mark.asyncio
-async def test_stream_options(winning_call, client):
+@pytest.mark.encoder_decoder
+async def test_stream_options(winning_call, client_and_model):
+    client, model_name = client_and_model
     res = await client.audio.transcriptions.create(
-        model=MODEL_NAME,
+        model=model_name,
         file=winning_call,
         language="en",
         temperature=0.0,
@@ -237,13 +249,15 @@
 
 
 @pytest.mark.asyncio
-async def test_sampling_params(mary_had_lamb, client):
+@pytest.mark.encoder_decoder
+async def test_sampling_params(mary_had_lamb, client_and_model):
+    client, model_name = client_and_model
     """
     Compare sampling with params and greedy sampling to assert results
     are different when extreme sampling parameters values are picked.
     """
     transcription = await client.audio.transcriptions.create(
-        model=MODEL_NAME,
+        model=model_name,
         file=mary_had_lamb,
         language="en",
         temperature=0.8,
@@ -259,7 +273,7 @@ async def test_sampling_params(mary_had_lamb, client):
     )
 
     greedy_transcription = await client.audio.transcriptions.create(
-        model=MODEL_NAME,
+        model=model_name,
         file=mary_had_lamb,
         language="en",
         temperature=0.0,
@@ -270,12 +284,14 @@
 
 
 @pytest.mark.asyncio
-async def test_audio_prompt(mary_had_lamb, client):
+@pytest.mark.encoder_decoder
+async def test_audio_prompt(mary_had_lamb, client_and_model):
+    client, model_name = client_and_model
     prompt = "This is a speech, recorded in a phonograph."
     # Prompts should not omit the part of original prompt while transcribing.
prefix = "The first words I spoke in the original phonograph" transcription = await client.audio.transcriptions.create( - model=MODEL_NAME, + model=model_name, file=mary_had_lamb, language="en", response_format="text", @@ -284,7 +300,7 @@ async def test_audio_prompt(mary_had_lamb, client): out = json.loads(transcription)["text"] assert prefix in out transcription_wprompt = await client.audio.transcriptions.create( - model=MODEL_NAME, + model=model_name, file=mary_had_lamb, language="en", response_format="text", diff --git a/tests/entrypoints/openai/test_translation_validation.py b/tests/entrypoints/openai/test_translation_validation.py index c060ee2b1922..16e6c142534a 100644 --- a/tests/entrypoints/openai/test_translation_validation.py +++ b/tests/entrypoints/openai/test_translation_validation.py @@ -84,6 +84,7 @@ async def test_basic_audio_with_lora(mary_had_lamb): # NOTE: (NickLucche) the large-v3-turbo model was not trained on translation! @pytest.mark.asyncio +@pytest.mark.encoder_decoder async def test_basic_audio(foscolo, client_and_model): client, model_name = client_and_model translation = await client.audio.translations.create( @@ -99,6 +100,7 @@ async def test_basic_audio(foscolo, client_and_model): @pytest.mark.asyncio +@pytest.mark.encoder_decoder async def test_audio_prompt(foscolo, client_and_model): client, model_name = client_and_model # Condition whisper on starting text @@ -117,6 +119,7 @@ async def test_audio_prompt(foscolo, client_and_model): @pytest.mark.asyncio +@pytest.mark.encoder_decoder async def test_streaming_response(foscolo, client_and_model, server): client, model_name = client_and_model translation = "" @@ -168,6 +171,7 @@ async def test_streaming_response(foscolo, client_and_model, server): @pytest.mark.asyncio +@pytest.mark.encoder_decoder async def test_stream_options(foscolo, server): server, model_name = server url = server.url_for("v1/audio/translations") @@ -207,6 +211,7 @@ async def test_stream_options(foscolo, server): @pytest.mark.asyncio +@pytest.mark.encoder_decoder async def test_long_audio_request(foscolo, client_and_model): client, model_name = client_and_model if model_name == "google/gemma-3n-E2B-it": diff --git a/tests/models/language/pooling/test_embedding.py b/tests/models/language/pooling/test_embedding.py index c8deffbf66db..fd4095188c28 100644 --- a/tests/models/language/pooling/test_embedding.py +++ b/tests/models/language/pooling/test_embedding.py @@ -36,14 +36,24 @@ pytest.mark.core_model, pytest.mark.cpu_model, pytest.mark.slow_test, + pytest.mark.encoder_decoder, ], ), - pytest.param("sentence-transformers/all-MiniLM-L12-v2"), - pytest.param("intfloat/multilingual-e5-small"), + pytest.param( + "sentence-transformers/all-MiniLM-L12-v2", + marks=[pytest.mark.encoder_decoder], + ), + pytest.param( + "intfloat/multilingual-e5-small", marks=[pytest.mark.encoder_decoder] + ), # [Cross-Encoder] pytest.param( "sentence-transformers/stsb-roberta-base-v2", - marks=[pytest.mark.core_model, pytest.mark.cpu_model], + marks=[ + pytest.mark.core_model, + pytest.mark.cpu_model, + pytest.mark.encoder_decoder, + ], ), ], )