FreedomIntelligence · JuhaoLiang1997 · May 19, 2026 · May 15, 2026 · May 18, 2026 · May 19, 2026
diff --git a/README.md b/README.md
@@ -88,6 +88,7 @@ Reference runners live under `runners/` (see each folder’s `meta.json`). The t
 | Hardware | Runner folder | Framework | A | B | C | D | E | F | G |
 |---|---|---|:-:|:-:|:-:|:-:|:-:|:-:|:-:|
 | NVIDIA GPU | `nvidia_sglang_c43a8309` | SGLang | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| NVIDIA GPU | `nvidia_vllm020_0f6c56e4` | vLLM | ⋯ | ⋯ | ⋯ | ⋯ | ⋯ | ⋯ | ⋯ |
 | NVIDIA GPU | `nvidia_vllm_47f5d58e` | vLLM | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
 | NVIDIA V100 (SM70) | `nvidia_onecat_vllm_12a253c2` | 1Cat-vLLM | ⋯ | ⋯ | ⋯ | ⋯ | ⋯ | — | ⋯ |
 | AMD GPU | `amd_vllm_rocm_6c18cd8f` | vLLM-ROCm | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |

diff --git a/configs/runner_configs/runner_nvidia_vllm020_0f6c56e4.yaml.example b/configs/runner_configs/runner_nvidia_vllm020_0f6c56e4.yaml.example
@@ -0,0 +1,82 @@
+# AccelMark runner config — nvidia_vllm020_0f6c56e4 (vLLM 0.20 on NVIDIA)
+#
+# Copy this file to runner_nvidia_vllm020_0f6c56e4.yaml (remove .example suffix)
+# and edit as needed for your hardware. The actual .yaml is gitignored.
+#
+# These settings adapt the runner to your hardware environment.
+# They are recorded in result.json task.extra_config for transparency
+# but are NOT part of the benchmark identity (not hashed into run_id).
+#
+# Merge priority: CLI flags > suite-specific > global defaults > runner defaults
+
+# ── Global defaults (apply to all suites) ─────────────────────────────────────
+
+# Tensor parallel size — number of GPUs to use (default: 1)
+tensor_parallel_size: 1
+
+# Disable CUDAGraph/compilation. Required for pre-Ampere GPUs (V100, T4).
+# Set to true if you encounter CUDA graph errors on older hardware.
+enforce_eager: false
+
+# Maximum number of sequences in a batch (default: 512).
+# Reduce on low-memory GPUs: 128 for 16 GB, 64 for 12 GB or less.
+max_num_seqs: 512
+
+# Fraction of GPU memory vLLM may use (weights, activations, KV cache; default: 0.90).
+# Reduce if you get OOM errors: try 0.80 for tighter memory budgets.
+gpu_memory_utilization: 0.90
+
+# Pass-through kwargs forwarded directly to vLLM LLM() / AsyncEngineArgs().
+# Use for any vLLM setting not listed above. See vLLM docs for valid keys:
+# https://docs.vllm.ai/en/latest/api/vllm/engine/arg_utils.html
+#
+# 0.20-specific knobs you may want to set (uncomment as needed):
+# engine_kwargs:
+#   # FlashAttention 4 is the 0.20 default for MLA prefill; uncomment to pin it
+#   # for reproducibility, or change the value to force the FA3 / Triton fallback.
+#   # attention_backend: FLASH_ATTN_4
+#
+#   # Model Runner V2 + new CUDA-graph paths:
+#   # compilation_config:
+#   #   cudagraph_mode: full_and_piecewise
+#
+#   # TurboQuant 2-bit KV cache (suite_C, --precision turboquant):
+#   # kv_cache_dtype: turboquant
+#
+#   swap_space: 8
+#   max_seq_len_to_capture: 4096
+
+# ── Suite-specific overrides ───────────────────────────────────────────────────
+# Keys here override the global defaults above for a specific suite only.
+# Only the section matching the current suite is used — other suite sections
+# are never loaded or recorded.
+
+suites:
+  suite_C:
+    # Quantization suite (FP8/W8A8/W8A16 via compressed-tensors).
+    # enforce_eager disables CUDA graphs — required for W8A8/W8A16 accuracy on vLLM 0.20.
+    # Note: FP8 still fails on Ampere (A100, sm < 8.9): vLLM 0.20 falls back to a broken
+    # Marlin weight-only FP8 path. Use H100+ for Suite C FP8, or the vLLM 0.7.3 runner on A100.
+    enforce_eager: true
+
+  suite_D:
+    # Long-context suite — reduce batch size and reserve more memory.
+    max_num_seqs: 64
+    gpu_memory_utilization: 0.85
+
+  suite_F:
+    # Consumer/edge GPU — enforce_eager often needed for pre-Ampere chips
+    # enforce_eager: true
+    max_num_seqs: 128
+
+# ── Speculative decoding (suite_A extra scenario) ─────────────────────────────
+# To run the speculative scenario, uncomment the suite_A block below and nest it under the
+# existing `suites:` key above; a second top-level `suites:` key would shadow those overrides.
+# The draft model shares the target model's GPU; it is configured via vLLM engine_kwargs.
+#
+# suites:
+#   suite_A:
+#     engine_kwargs:
+#       speculative_model: "meta-llama/Llama-3.2-1B-Instruct"
+#       num_speculative_tokens: 4
+#       speculative_draft_tensor_parallel_size: 1
diff --git a/...d/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_vllm020_0f6c56e4_8f83bfab/accuracy/accuracy.json b/...d/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_vllm020_0f6c56e4_8f83bfab/accuracy/accuracy.json
@@ -0,0 +1,8 @@
+{
+  "subset_score": 0.61,
+  "baseline_delta": 0.01,
+  "valid": true,
+  "framework": "vLLM",
+  "precision": "BF16",
+  "notes": "Integrated accuracy check \u2014 used same vLLM instance as benchmark."
+}
diff --git a/...rified/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_vllm020_0f6c56e4_8f83bfab/burst/result.json b/...rified/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_vllm020_0f6c56e4_8f83bfab/burst/result.json
@@ -0,0 +1,160 @@
+{
+  "schema_version": "1.0",
+  "suite_id": "suite_A",
+  "implementation_id": "nvidia_vllm020_0f6c56e4",
+  "chip": {
+    "name": "NVIDIA A100-SXM4-80GB",
+    "vendor": "NVIDIA",
+    "count": 1,
+    "memory_gb": 80.0,
+    "interconnect_intra_node": null,
+    "interconnect_inter_node": null
+  },
+  "environment": {
+    "collected_at": "2026-05-18T04:31:01.283634+00:00",
+    "accelerators": [
+      {
+        "index": 0,
+        "name": "NVIDIA A100-SXM4-80GB",
+        "vendor": "NVIDIA",
+        "memory_gb": 80.0,
+        "driver_version": "580.65.06",
+        "firmware_version": null,
+        "compute_capability": "8.0",
+        "supports_bf16": true
+      }
+    ],
+    "accelerator_platform": "nvidia",
+    "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tPXB\tSYS\tSYS\t0-63,128-191\t0\t\tN/A\nNIC0\tPXB\t X \tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC2\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n  X    = Self\n  SYS  = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n  NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n  PHB  = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n  PXB  = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n  PIX  = Connection traversing at most a single PCIe bridge\n  NV#  = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n  NIC0: mlx5_0\n  NIC1: mlx5_1\n  NIC2: mlx5_2\n\n",
+    "intra_node_interconnect": null,
+    "cpu": {
+      "model": "AMD EPYC 7742 64-Core Processor",
+      "physical_cores": 128,
+      "logical_cores": 255,
+      "numa_nodes": 2
+    },
+    "system_memory_gb": 1007.7,
+    "pcie_generation": "PCIe Gen 4",
+    "cpu_accelerator_bandwidth_gbs": null,
+    "network_interfaces": [
+      {
+        "name": "mlx5_0",
+        "type": "InfiniBand/RoCE",
+        "bandwidth_gbps": null
+      },
+      {
+        "name": "mlx5_1",
+        "type": "InfiniBand/RoCE",
+        "bandwidth_gbps": null
+      },
+      {
+        "name": "mlx5_2",
+        "type": "InfiniBand/RoCE",
+        "bandwidth_gbps": null
+      }
+    ],
+    "os": "Ubuntu 22.04.4 LTS",
+    "python_version": "3.12.0",
+    "kernel_version": "5.15.0-60-generic",
+    "runtime_version": "CUDA 13.0",
+    "pytorch_version": "2.11.0+cu130"
+  },
+  "software": {
+    "framework": "vLLM",
+    "framework_version": "0.20.1+transformers-5.8.1",
+    "driver_version": "580.65.06",
+    "runtime_version": "CUDA 13.0",
+    "os": "Ubuntu 22.04.4 LTS",
+    "python_version": "3.12.0"
+  },
+  "model": {
+    "model_id": "meta-llama/Meta-Llama-3-8B-Instruct",
+    "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2",
+    "model_name": null,
+    "model_note": null,
+    "model_source": "local",
+    "architecture": "dense",
+    "parameter_count_b": 8.0,
+    "precision": "BF16",
+    "effective_dtype": null,
+    "quantization_method": null,
+    "model_format": "HuggingFace original"
+  },
+  "task": {
+    "scenario": "burst",
+    "num_runs": 3,
+    "warmup_runs": 1,
+    "parallelism": {
+      "tensor_parallel_size": 1,
+      "pipeline_parallel_size": 1,
+      "expert_parallel_size": 1,
+      "data_parallel_size": 1
+    },
+    "extra_config": null,
+    "runtime_metrics": null
+  },
+  "metrics": {
+    "burst": {
+      "sla_ttft_ms": 500,
+      "burst_steady_qps": 5,
+      "burst_peak_qps": 25,
+      "burst_duration_seconds": 30,
+      "burst_interval_seconds": 120,
+      "steady_requests_total": 1812,
+      "burst_requests_total": 2245,
+      "steady_ttft_p50_ms": 39.39,
+      "steady_ttft_p99_ms": 79.1,
+      "burst_ttft_p50_ms": 7082.87,
+      "burst_ttft_p99_ms": 17212.99,
+      "sla_met_during_burst": false,
+      "burst_degradation_ratio": 217.605,
+      "results_by_cycle": [
+        {
+          "cycle": 1,
+          "steady_requests": 581,
+          "burst_requests": 760,
+          "steady_ttft_p99_ms": 89.81,
+          "burst_ttft_p99_ms": 17855.37
+        },
+        {
+          "cycle": 2,
+          "steady_requests": 595,
+          "burst_requests": 734,
+          "steady_ttft_p99_ms": 47.72,
+          "burst_ttft_p99_ms": 16592.12
+        },
+        {
+          "cycle": 3,
+          "steady_requests": 636,
+          "burst_requests": 751,
+          "steady_ttft_p99_ms": 48.05,
+          "burst_ttft_p99_ms": 16579.9
+        }
+      ]
+    }
+  },
+  "accuracy": {
+    "subset_score": null,
+    "baseline_delta": null,
+    "valid": false,
+    "notes": "Run --scenario accuracy to check model accuracy."
+  },
+  "meta": {
+    "submitted_by": "JuhaoLiang1997",
+    "submission_type": "individual",
+    "date": "2026-05-18",
+    "time": "05:55:58",
+    "run_id": "8f83bfab",
+    "run_name": "nvidia_a100_sxm4_80gbx1_suite_A_nvidia_vllm020_0f6c56e4_8f83bfab",
+    "flagged": null,
+    "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py",
+    "env_info_file": "../env_info.json",
+    "log_file": "run.log",
+    "samples_file": "samples.jsonl",
+    "notes": null,
+    "benchmark_start_time": "2026-05-18T05:46:54.960197+00:00",
+    "benchmark_end_time": "2026-05-18T05:55:58.450157+00:00",
+    "benchmark_elapsed_minutes": 9.1,
+    "model_load_seconds": 39.5
+  }
+}
diff --git a/...s/verified/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_vllm020_0f6c56e4_8f83bfab/env_info.json b/...s/verified/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_vllm020_0f6c56e4_8f83bfab/env_info.json
@@ -0,0 +1,49 @@
+{
+  "collected_at": "2026-05-18T04:31:01.283634+00:00",
+  "accelerators": [
+    {
+      "index": 0,
+      "name": "NVIDIA A100-SXM4-80GB",
+      "vendor": "NVIDIA",
+      "memory_gb": 80.0,
+      "driver_version": "580.65.06",
+      "firmware_version": null,
+      "compute_capability": "8.0",
+      "supports_bf16": true
+    }
+  ],
+  "accelerator_platform": "nvidia",
+  "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tPXB\tSYS\tSYS\t0-63,128-191\t0\t\tN/A\nNIC0\tPXB\t X \tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC2\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n  X    = Self\n  SYS  = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n  NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n  PHB  = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n  PXB  = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n  PIX  = Connection traversing at most a single PCIe bridge\n  NV#  = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n  NIC0: mlx5_0\n  NIC1: mlx5_1\n  NIC2: mlx5_2\n\n",
+  "intra_node_interconnect": null,
+  "cpu": {
+    "model": "AMD EPYC 7742 64-Core Processor",
+    "physical_cores": 128,
+    "logical_cores": 255,
+    "numa_nodes": 2
+  },
+  "system_memory_gb": 1007.7,
+  "pcie_generation": "PCIe Gen 4",
+  "cpu_accelerator_bandwidth_gbs": null,
+  "network_interfaces": [
+    {
+      "name": "mlx5_0",
+      "type": "InfiniBand/RoCE",
+      "bandwidth_gbps": null
+    },
+    {
+      "name": "mlx5_1",
+      "type": "InfiniBand/RoCE",
+      "bandwidth_gbps": null
+    },
+    {
+      "name": "mlx5_2",
+      "type": "InfiniBand/RoCE",
+      "bandwidth_gbps": null
+    }
+  ],
+  "os": "Ubuntu 22.04.4 LTS",
+  "python_version": "3.12.0",
+  "kernel_version": "5.15.0-60-generic",
+  "runtime_version": "CUDA 13.0",
+  "pytorch_version": "2.11.0+cu130"
+}