Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ Reference runners live under `runners/` (see each folder’s `meta.json`). The t
| Hardware | Runner folder | Framework | A | B | C | D | E | F | G |
|---|---|---|:-:|:-:|:-:|:-:|:-:|:-:|:-:|
| NVIDIA GPU | `nvidia_sglang_c43a8309` | SGLang | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
| NVIDIA GPU | `nvidia_vllm020_0f6c56e4` | vLLM | ⋯ | ⋯ | ⋯ | ⋯ | ⋯ | ⋯ | ⋯ |
| NVIDIA GPU | `nvidia_vllm_47f5d58e` | vLLM | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
| NVIDIA V100 (SM70) | `nvidia_onecat_vllm_12a253c2` | 1Cat-vLLM | ⋯ | ⋯ | ⋯ | ⋯ | ⋯ | — | ⋯ |
| AMD GPU | `amd_vllm_rocm_6c18cd8f` | vLLM-ROCm | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
Expand Down
82 changes: 82 additions & 0 deletions configs/runner_configs/runner_nvidia_vllm020_0f6c56e4.yaml.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# AccelMark runner config — nvidia_vllm020_0f6c56e4 (vLLM 0.20 on NVIDIA)
#
# Copy this file to runner_nvidia_vllm020_0f6c56e4.yaml (remove .example suffix)
# and edit as needed for your hardware. The actual .yaml is gitignored.
#
# These settings adapt the runner to your hardware environment.
# They are recorded in result.json task.extra_config for transparency
# but are NOT part of the benchmark identity (not hashed into run_id).
#
# Merge priority: CLI flags > suite-specific > global defaults > runner defaults

# ── Global defaults (apply to all suites) ─────────────────────────────────────

# Tensor parallel size — number of GPUs to use (default: 1)
tensor_parallel_size: 1

# Disable CUDAGraph/compilation. Required for pre-Ampere GPUs (V100, T4).
# Set to true if you encounter CUDA graph errors on older hardware.
enforce_eager: false

# Maximum number of sequences in a batch (default: 512).
# Reduce on low-memory GPUs: 128 for 16 GB, 64 for 12 GB or less.
max_num_seqs: 512

# Fraction of each GPU's memory that vLLM may use in total — model weights,
# activations, and KV cache (default: 0.90).
# Reduce if you get OOM errors: try 0.80 for tighter memory budgets.
gpu_memory_utilization: 0.90

# Pass-through kwargs forwarded directly to vLLM LLM() / AsyncEngineArgs().
# Use for any vLLM setting not listed above. See vLLM docs for valid keys:
# https://docs.vllm.ai/en/latest/api/vllm/engine/arg_utils.html
#
# 0.20-specific knobs you may want to set (uncomment as needed):
# engine_kwargs:
#   # FlashAttention 4 is the 0.20 default for MLA prefill; uncomment to pin
#   # for reproducibility or to force back to FA3 / Triton fallback.
#   # attention_backend: FLASH_ATTN_4
#
#   # Model Runner V2 + new CUDA-graph paths:
#   # compilation_config:
#   #   cudagraph_mode: full_and_piecewise
#
#   # TurboQuant 2-bit KV cache (suite_C, --precision turboquant):
#   # kv_cache_dtype: turboquant
#
#   swap_space: 8
#   max_seq_len_to_capture: 4096

# ── Suite-specific overrides ───────────────────────────────────────────────────
# Keys here override the global defaults above for a specific suite only.
# Only the section matching the current suite is used — other suite sections
# are never loaded or recorded.

suites:
  suite_C:
    # Quantization suite (FP8/W8A8/W8A16 via compressed-tensors).
    # enforce_eager disables CUDA graphs — required for W8A8/W8A16 accuracy on vLLM 0.20.
    # Note: FP8 still fails on Ampere (A100, compute capability < 8.9): vLLM 0.20
    # falls back to a broken Marlin weight-only FP8 path. Use H100+ for Suite C FP8,
    # or the vLLM 0.7.3 runner on A100.
    enforce_eager: true

  suite_D:
    # Long-context suite — reduce batch size and lower gpu_memory_utilization
    # to leave more GPU memory headroom.
    max_num_seqs: 64
    gpu_memory_utilization: 0.85

  suite_F:
    # Consumer/edge GPU — enforce_eager often needed for pre-Ampere chips
    # enforce_eager: true
    max_num_seqs: 128

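# ── Speculative decoding (suite_A extra scenario) ─────────────────────────────
# To run the speculative scenario, add the suite_A entry shown below under the
# existing `suites:` key above. Do NOT add a second top-level `suites:` key:
# duplicate keys are invalid YAML, and most parsers silently keep only the last
# one, which would drop the suite_C / suite_D / suite_F overrides.
# The draft model runs on the same GPU as the target model.
# Speculative decoding is configured via vLLM engine_kwargs.
# NOTE(review): recent vLLM releases accept these settings as a single
# `speculative_config` mapping instead of separate speculative_* arguments —
# confirm which form this runner forwards before enabling.
#
# suites:                # existing key — shown for context only
#   suite_A:
#     engine_kwargs:
#       speculative_model: "meta-llama/Llama-3.2-1B-Instruct"
#       num_speculative_tokens: 4
#       speculative_draft_tensor_parallel_size: 1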
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"subset_score": 0.61,
"baseline_delta": 0.01,
"valid": true,
"framework": "vLLM",
"precision": "BF16",
"notes": "Integrated accuracy check \u2014 used same vLLM instance as benchmark."
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
{
"schema_version": "1.0",
"suite_id": "suite_A",
"implementation_id": "nvidia_vllm020_0f6c56e4",
"chip": {
"name": "NVIDIA A100-SXM4-80GB",
"vendor": "NVIDIA",
"count": 1,
"memory_gb": 80.0,
"interconnect_intra_node": null,
"interconnect_inter_node": null
},
"environment": {
"collected_at": "2026-05-18T04:31:01.283634+00:00",
"accelerators": [
{
"index": 0,
"name": "NVIDIA A100-SXM4-80GB",
"vendor": "NVIDIA",
"memory_gb": 80.0,
"driver_version": "580.65.06",
"firmware_version": null,
"compute_capability": "8.0",
"supports_bf16": true
}
],
"accelerator_platform": "nvidia",
"accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tPXB\tSYS\tSYS\t0-63,128-191\t0\t\tN/A\nNIC0\tPXB\t X \tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC2\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n",
"intra_node_interconnect": null,
"cpu": {
"model": "AMD EPYC 7742 64-Core Processor",
"physical_cores": 128,
"logical_cores": 255,
"numa_nodes": 2
},
"system_memory_gb": 1007.7,
"pcie_generation": "PCIe Gen 4",
"cpu_accelerator_bandwidth_gbs": null,
"network_interfaces": [
{
"name": "mlx5_0",
"type": "InfiniBand/RoCE",
"bandwidth_gbps": null
},
{
"name": "mlx5_1",
"type": "InfiniBand/RoCE",
"bandwidth_gbps": null
},
{
"name": "mlx5_2",
"type": "InfiniBand/RoCE",
"bandwidth_gbps": null
}
],
"os": "Ubuntu 22.04.4 LTS",
"python_version": "3.12.0",
"kernel_version": "5.15.0-60-generic",
"runtime_version": "CUDA 13.0",
"pytorch_version": "2.11.0+cu130"
},
"software": {
"framework": "vLLM",
"framework_version": "0.20.1+transformers-5.8.1",
"driver_version": "580.65.06",
"runtime_version": "CUDA 13.0",
"os": "Ubuntu 22.04.4 LTS",
"python_version": "3.12.0"
},
"model": {
"model_id": "meta-llama/Meta-Llama-3-8B-Instruct",
"model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2",
"model_name": null,
"model_note": null,
"model_source": "local",
"architecture": "dense",
"parameter_count_b": 8.0,
"precision": "BF16",
"effective_dtype": null,
"quantization_method": null,
"model_format": "HuggingFace original"
},
"task": {
"scenario": "burst",
"num_runs": 3,
"warmup_runs": 1,
"parallelism": {
"tensor_parallel_size": 1,
"pipeline_parallel_size": 1,
"expert_parallel_size": 1,
"data_parallel_size": 1
},
"extra_config": null,
"runtime_metrics": null
},
"metrics": {
"burst": {
"sla_ttft_ms": 500,
"burst_steady_qps": 5,
"burst_peak_qps": 25,
"burst_duration_seconds": 30,
"burst_interval_seconds": 120,
"steady_requests_total": 1812,
"burst_requests_total": 2245,
"steady_ttft_p50_ms": 39.39,
"steady_ttft_p99_ms": 79.1,
"burst_ttft_p50_ms": 7082.87,
"burst_ttft_p99_ms": 17212.99,
"sla_met_during_burst": false,
"burst_degradation_ratio": 217.605,
"results_by_cycle": [
{
"cycle": 1,
"steady_requests": 581,
"burst_requests": 760,
"steady_ttft_p99_ms": 89.81,
"burst_ttft_p99_ms": 17855.37
},
{
"cycle": 2,
"steady_requests": 595,
"burst_requests": 734,
"steady_ttft_p99_ms": 47.72,
"burst_ttft_p99_ms": 16592.12
},
{
"cycle": 3,
"steady_requests": 636,
"burst_requests": 751,
"steady_ttft_p99_ms": 48.05,
"burst_ttft_p99_ms": 16579.9
}
]
}
},
"accuracy": {
"subset_score": null,
"baseline_delta": null,
"valid": false,
"notes": "Run --scenario accuracy to check model accuracy."
},
"meta": {
"submitted_by": "JuhaoLiang1997",
"submission_type": "individual",
"date": "2026-05-18",
"time": "05:55:58",
"run_id": "8f83bfab",
"run_name": "nvidia_a100_sxm4_80gbx1_suite_A_nvidia_vllm020_0f6c56e4_8f83bfab",
"flagged": null,
"reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py",
"env_info_file": "../env_info.json",
"log_file": "run.log",
"samples_file": "samples.jsonl",
"notes": null,
"benchmark_start_time": "2026-05-18T05:46:54.960197+00:00",
"benchmark_end_time": "2026-05-18T05:55:58.450157+00:00",
"benchmark_elapsed_minutes": 9.1,
"model_load_seconds": 39.5
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
{
"collected_at": "2026-05-18T04:31:01.283634+00:00",
"accelerators": [
{
"index": 0,
"name": "NVIDIA A100-SXM4-80GB",
"vendor": "NVIDIA",
"memory_gb": 80.0,
"driver_version": "580.65.06",
"firmware_version": null,
"compute_capability": "8.0",
"supports_bf16": true
}
],
"accelerator_platform": "nvidia",
"accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tPXB\tSYS\tSYS\t0-63,128-191\t0\t\tN/A\nNIC0\tPXB\t X \tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC2\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n",
"intra_node_interconnect": null,
"cpu": {
"model": "AMD EPYC 7742 64-Core Processor",
"physical_cores": 128,
"logical_cores": 255,
"numa_nodes": 2
},
"system_memory_gb": 1007.7,
"pcie_generation": "PCIe Gen 4",
"cpu_accelerator_bandwidth_gbs": null,
"network_interfaces": [
{
"name": "mlx5_0",
"type": "InfiniBand/RoCE",
"bandwidth_gbps": null
},
{
"name": "mlx5_1",
"type": "InfiniBand/RoCE",
"bandwidth_gbps": null
},
{
"name": "mlx5_2",
"type": "InfiniBand/RoCE",
"bandwidth_gbps": null
}
],
"os": "Ubuntu 22.04.4 LTS",
"python_version": "3.12.0",
"kernel_version": "5.15.0-60-generic",
"runtime_version": "CUDA 13.0",
"pytorch_version": "2.11.0+cu130"
}
Loading
Loading