33 changes: 27 additions & 6 deletions .buildkite/test-amd.yaml
@@ -144,7 +144,7 @@ steps:

- label: Entrypoints Integration Test (API Server) # 100min
timeout_in_minutes: 130
mirror_hardwares: [amdexperimental]
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
working_dir: "/vllm-workspace/tests"
@@ -157,12 +157,21 @@
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
- >-
pytest -v -s entrypoints/openai
--ignore=entrypoints/openai/test_chat_with_tool_reasoning.py
--ignore=entrypoints/openai/test_oot_registration.py
--ignore=entrypoints/openai/test_tensorizer_entrypoint.py
--ignore=entrypoints/openai/correctness/
--ignore=entrypoints/openai/test_collective_rpc.py
--ignore=entrypoints/openai/tool_parsers/
--ignore=entrypoints/openai/test_optional_middleware.py
# Skip test_optional_middleware.py since encoder models are not supported on ROCm: https://github.com/vllm-project/vllm/issues/27442
- pytest -v -s entrypoints/test_chat_utils.py

- label: Entrypoints Integration Test (Pooling)
timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
- label: Entrypoints Integration Test (Pooling) # 10min
timeout_in_minutes: 15
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
working_dir: "/vllm-workspace/tests"
@@ -173,7 +182,19 @@
- tests/entrypoints/pooling
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/pooling
# Skip encoder models on ROCm: https://github.com/vllm-project/vllm/issues/27442
- >-
pytest -v -s entrypoints/pooling
--ignore=entrypoints/pooling/correctness/test_mteb_score.py
--ignore=entrypoints/pooling/correctness/test_mteb_embed.py
--ignore=entrypoints/pooling/llm/test_embedding.py
--ignore=entrypoints/pooling/llm/test_encode.py
--ignore=entrypoints/pooling/openai/test_embedding.py
--ignore=entrypoints/pooling/openai/test_embedding_dimensions.py
--ignore=entrypoints/pooling/openai/test_embedding_long_text.py
--ignore=entrypoints/pooling/openai/test_rerank.py
--ignore=entrypoints/pooling/openai/test_score.py
--ignore=entrypoints/pooling/openai/test_truncation.py

- label: Distributed Tests (4 GPUs) # 35min
timeout_in_minutes: 50
1 change: 1 addition & 0 deletions pyproject.toml
@@ -107,6 +107,7 @@ markers = [
"distributed: run this test only in distributed GPU tests",
"skip_v1: do not run this test with v1",
"optional: optional tests that are automatically skipped, include --optional to run them",
"encoder_decoder: tests that use encoder-decoder models, skipped on ROCm",
]

[tool.ty.src]
43 changes: 36 additions & 7 deletions tests/conftest.py
@@ -1250,13 +1250,42 @@ def pytest_addoption(parser):


def pytest_collection_modifyitems(config, items):
if config.getoption("--optional"):
# --optional given in cli: do not skip optional tests
return
skip_optional = pytest.mark.skip(reason="need --optional option to run")
for item in items:
if "optional" in item.keywords:
item.add_marker(skip_optional)
if not config.getoption("--optional"):
skip_optional = pytest.mark.skip(reason="need --optional option to run")
for item in items:
if "optional" in item.keywords:
item.add_marker(skip_optional)

from vllm.platforms import current_platform

# Skip encoder-decoder models on ROCm: https://github.com/vllm-project/vllm/issues/27442
ENCODER_DECODER_MODELS = [
# Encoder-decoder models
"openai/whisper-small",
"openai/whisper-large-v3-turbo",
# Encoder-only models (cross-encoders, embedding models)
"cross-encoder/ms-marco-MiniLM-L-6-v2",
"intfloat/e5-small",
"intfloat/multilingual-e5-small",
"BAAI/bge-reranker-base",
"BAAI/bge-reranker-v2-m3",
"BAAI/bge-base-en-v1.5",
"Snowflake/snowflake-arctic-embed-m-v1.5",
"sentence-transformers/all-MiniLM-L12-v2",
"sentence-transformers/stsb-roberta-base-v2",
]

if current_platform.is_rocm():
skip_encoder_decoder = pytest.mark.skip(
reason="Encoder-decoder models not supported on ROCm (all ROCm attention backends only support decoder-only models)"
)
for item in items:
if "encoder_decoder" in item.keywords:
if any(
encoder_model in item.nodeid
for encoder_model in ENCODER_DECODER_MODELS
):
item.add_marker(skip_encoder_decoder)


@pytest.fixture(scope="session")
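Note on the hook above: a test is only skipped on ROCm when it both carries the `encoder_decoder` marker and has one of the listed model names in its node ID (which is where parametrize and fixture params end up). Marked tests can also be deselected manually with `pytest -m "not encoder_decoder"`. The sketch below is illustrative, not part of this diff; it mirrors how `test_basic_audio` is marked and parametrized in `tests/entrypoints/openai/test_transcription_validation.py`.

```python
import pytest


@pytest.mark.encoder_decoder
@pytest.mark.parametrize(
    "model_name",
    ["openai/whisper-large-v3-turbo", "mistralai/Voxtral-Mini-3B-2507"],
)
def test_example(model_name):
    # On ROCm, the Whisper case is skipped because its node ID,
    # e.g. "test_example[openai/whisper-large-v3-turbo]", contains a model
    # from ENCODER_DECODER_MODELS; the Voxtral case still runs.
    assert isinstance(model_name, str)
```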
7 changes: 4 additions & 3 deletions tests/entrypoints/openai/test_enable_force_include_usage.py
@@ -71,8 +71,8 @@ async def test_chat_with_enable_force_include_usage(
assert chunk.usage is None


@pytest.fixture(scope="module")
def transcription_server_with_force_include_usage():
@pytest.fixture(scope="module", params=["openai/whisper-large-v3-turbo"])
def transcription_server_with_force_include_usage(request):
args = [
# use half precision for speed and memory savings in CI environment
"--dtype",
@@ -85,7 +85,7 @@ def transcription_server_with_force_include_usage():
"0.2",
]

with RemoteOpenAIServer("openai/whisper-large-v3-turbo", args) as remote_server:
with RemoteOpenAIServer(request.param, args) as remote_server:
yield remote_server


Expand All @@ -100,6 +100,7 @@ async def transcription_client_with_force_include_usage(


@pytest.mark.asyncio
@pytest.mark.encoder_decoder
async def test_transcription_with_enable_force_include_usage(
transcription_client_with_force_include_usage, winning_call
):
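A plausible reading of the fixture change above: turning the hardcoded Whisper model into a fixture param puts the model name into each consuming test's node ID, which is what the ROCm skip hook in `tests/conftest.py` matches against. A minimal sketch of the pattern (names are illustrative, not from this diff):

```python
import pytest


@pytest.fixture(scope="module", params=["openai/whisper-large-v3-turbo"])
def model_name(request):
    # The param value becomes part of every consuming test's node ID,
    # e.g. "test_uses_model[openai/whisper-large-v3-turbo]".
    return request.param


@pytest.mark.encoder_decoder
def test_uses_model(model_name):
    # Skipped on ROCm by the collection hook; runs elsewhere.
    assert model_name.startswith("openai/")
```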
19 changes: 19 additions & 0 deletions tests/entrypoints/openai/test_run_batch.py
@@ -44,6 +44,13 @@


def test_empty_file():
from vllm.platforms import current_platform

if current_platform.is_rocm():
pytest.skip(
"intfloat/multilingual-e5-small is encoder-only, not supported on ROCm"
)

with (
tempfile.NamedTemporaryFile("w") as input_file,
tempfile.NamedTemporaryFile("r") as output_file,
@@ -128,6 +135,13 @@ def test_completions_invalid_input():


def test_embeddings():
from vllm.platforms import current_platform

if current_platform.is_rocm():
pytest.skip(
"intfloat/multilingual-e5-small is encoder-only, not supported on ROCm"
)

with (
tempfile.NamedTemporaryFile("w") as input_file,
tempfile.NamedTemporaryFile("r") as output_file,
@@ -159,6 +173,11 @@ def test_embeddings():

@pytest.mark.parametrize("input_batch", [INPUT_SCORE_BATCH, INPUT_RERANK_BATCH])
def test_score(input_batch):
from vllm.platforms import current_platform

if current_platform.is_rocm():
pytest.skip("BAAI/bge-reranker-v2-m3 is encoder-only, not supported on ROCm")

with (
tempfile.NamedTemporaryFile("w") as input_file,
tempfile.NamedTemporaryFile("r") as output_file,
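The run-batch tests above use runtime `pytest.skip(...)` calls guarded by `current_platform.is_rocm()`. A declarative alternative (a sketch only, not what this diff does) would be a shared `skipif` marker:

```python
import pytest

from vllm.platforms import current_platform

# Sketch of an equivalent declarative skip; not part of this PR.
skip_on_rocm = pytest.mark.skipif(
    current_platform.is_rocm(),
    reason="encoder-only models are not supported on ROCm",
)


@skip_on_rocm
def test_embeddings_smoke():
    ...
```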
66 changes: 41 additions & 25 deletions tests/entrypoints/openai/test_transcription_validation.py
@@ -14,7 +14,6 @@

from ...utils import RemoteOpenAIServer

MODEL_NAME = "openai/whisper-large-v3-turbo"
SERVER_ARGS = ["--enforce-eager"]
MISTRAL_FORMAT_ARGS = [
"--tokenizer_mode",
@@ -26,19 +25,21 @@
]


@pytest.fixture(scope="module")
def server():
with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as remote_server:
yield remote_server
@pytest.fixture(scope="module", params=["openai/whisper-large-v3-turbo"])
def server(request):
with RemoteOpenAIServer(request.param, SERVER_ARGS) as remote_server:
yield remote_server, request.param


@pytest_asyncio.fixture
async def client(server):
async def client_and_model(server):
server, model_name = server
async with server.get_async_client() as async_client:
yield async_client
yield async_client, model_name


@pytest.mark.asyncio
@pytest.mark.encoder_decoder
@pytest.mark.parametrize(
"model_name", ["openai/whisper-large-v3-turbo", "mistralai/Voxtral-Mini-3B-2507"]
)
@@ -66,6 +67,7 @@ async def test_basic_audio(mary_had_lamb, model_name):


@pytest.mark.asyncio
@pytest.mark.encoder_decoder
async def test_basic_audio_with_lora(mary_had_lamb):
"""Ensure STT (transcribe) requests can pass LoRA through to generate."""
model_name = "ibm-granite/granite-speech-3.3-2b"
@@ -137,16 +139,20 @@ async def test_non_asr_model(winning_call):


@pytest.mark.asyncio
async def test_bad_requests(mary_had_lamb, client):
@pytest.mark.encoder_decoder
async def test_bad_requests(mary_had_lamb, client_and_model):
client, model_name = client_and_model
# invalid language
with pytest.raises(openai.BadRequestError):
await client.audio.transcriptions.create(
model=MODEL_NAME, file=mary_had_lamb, language="hh", temperature=0.0
model=model_name, file=mary_had_lamb, language="hh", temperature=0.0
)


@pytest.mark.asyncio
async def test_long_audio_request(mary_had_lamb, client):
@pytest.mark.encoder_decoder
async def test_long_audio_request(mary_had_lamb, client_and_model):
client, model_name = client_and_model
mary_had_lamb.seek(0)
audio, sr = librosa.load(mary_had_lamb)
# Add small silence after each audio for repeatability in the split process
@@ -157,7 +163,7 @@ async def test_long_audio_request(mary_had_lamb, client):
sf.write(buffer, repeated_audio, sr, format="WAV")
buffer.seek(0)
transcription = await client.audio.transcriptions.create(
model=MODEL_NAME,
model=model_name,
file=buffer,
language="en",
response_format="text",
@@ -172,34 +178,38 @@


@pytest.mark.asyncio
async def test_completion_endpoints(client):
@pytest.mark.encoder_decoder
async def test_completion_endpoints(client_and_model):
client, model_name = client_and_model
# text to text model
res = await client.chat.completions.create(
model=MODEL_NAME,
model=model_name,
messages=[{"role": "system", "content": "You are a helpful assistant."}],
)
err = res.error
assert err["code"] == 400
assert err["message"] == "The model does not support Chat Completions API"

res = await client.completions.create(model=MODEL_NAME, prompt="Hello")
res = await client.completions.create(model=model_name, prompt="Hello")
err = res.error
assert err["code"] == 400
assert err["message"] == "The model does not support Completions API"


@pytest.mark.asyncio
async def test_streaming_response(winning_call, client):
@pytest.mark.encoder_decoder
async def test_streaming_response(winning_call, client_and_model):
client, model_name = client_and_model
transcription = ""
res_no_stream = await client.audio.transcriptions.create(
model=MODEL_NAME,
model=model_name,
file=winning_call,
response_format="json",
language="en",
temperature=0.0,
)
res = await client.audio.transcriptions.create(
model=MODEL_NAME,
model=model_name,
file=winning_call,
language="en",
temperature=0.0,
@@ -215,9 +225,11 @@


@pytest.mark.asyncio
async def test_stream_options(winning_call, client):
@pytest.mark.encoder_decoder
async def test_stream_options(winning_call, client_and_model):
client, model_name = client_and_model
res = await client.audio.transcriptions.create(
model=MODEL_NAME,
model=model_name,
file=winning_call,
language="en",
temperature=0.0,
@@ -237,13 +249,15 @@ async def test_sampling_params(mary_had_lamb, client):


@pytest.mark.asyncio
async def test_sampling_params(mary_had_lamb, client):
@pytest.mark.encoder_decoder
async def test_sampling_params(mary_had_lamb, client_and_model):
"""
Compare sampling with params and greedy sampling to assert results
are different when extreme sampling parameter values are picked.
"""
client, model_name = client_and_model
transcription = await client.audio.transcriptions.create(
model=MODEL_NAME,
model=model_name,
file=mary_had_lamb,
language="en",
temperature=0.8,
Expand All @@ -259,7 +273,7 @@ async def test_sampling_params(mary_had_lamb, client):
)

greedy_transcription = await client.audio.transcriptions.create(
model=MODEL_NAME,
model=model_name,
file=mary_had_lamb,
language="en",
temperature=0.0,
@@ -270,12 +284,14 @@ async def test_audio_prompt(mary_had_lamb, client):


@pytest.mark.asyncio
async def test_audio_prompt(mary_had_lamb, client):
@pytest.mark.encoder_decoder
async def test_audio_prompt(mary_had_lamb, client_and_model):
client, model_name = client_and_model
prompt = "This is a speech, recorded in a phonograph."
# Prompts should not omit the part of original prompt while transcribing.
prefix = "The first words I spoke in the original phonograph"
transcription = await client.audio.transcriptions.create(
model=MODEL_NAME,
model=model_name,
file=mary_had_lamb,
language="en",
response_format="text",
Expand All @@ -284,7 +300,7 @@ async def test_audio_prompt(mary_had_lamb, client):
out = json.loads(transcription)["text"]
assert prefix in out
transcription_wprompt = await client.audio.transcriptions.create(
model=MODEL_NAME,
model=model_name,
file=mary_had_lamb,
language="en",
response_format="text",