diff --git a/README.md b/README.md index 922c479d..7fac6641 100644 --- a/README.md +++ b/README.md @@ -88,6 +88,7 @@ Reference runners live under `runners/` (see each folder’s `meta.json`). The t | Hardware | Runner folder | Framework | A | B | C | D | E | F | G | |---|---|---|:-:|:-:|:-:|:-:|:-:|:-:|:-:| | NVIDIA GPU | `nvidia_sglang_c43a8309` | SGLang | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | +| NVIDIA GPU | `nvidia_vllm020_0f6c56e4` | vLLM | ⋯ | ⋯ | ⋯ | ⋯ | ⋯ | ⋯ | ⋯ | | NVIDIA GPU | `nvidia_vllm_47f5d58e` | vLLM | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | | NVIDIA V100 (SM70) | `nvidia_onecat_vllm_12a253c2` | 1Cat-vLLM | ⋯ | ⋯ | ⋯ | ⋯ | ⋯ | — | ⋯ | | AMD GPU | `amd_vllm_rocm_6c18cd8f` | vLLM-ROCm | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | diff --git a/configs/runner_configs/runner_nvidia_vllm020_0f6c56e4.yaml.example b/configs/runner_configs/runner_nvidia_vllm020_0f6c56e4.yaml.example new file mode 100644 index 00000000..906f7d7c --- /dev/null +++ b/configs/runner_configs/runner_nvidia_vllm020_0f6c56e4.yaml.example @@ -0,0 +1,82 @@ +# AccelMark runner config — nvidia_vllm020_0f6c56e4 (vLLM 0.20 on NVIDIA) +# +# Copy this file to runner_nvidia_vllm020_0f6c56e4.yaml (remove .example suffix) +# and edit as needed for your hardware. The actual .yaml is gitignored. +# +# These settings adapt the runner to your hardware environment. +# They are recorded in result.json task.extra_config for transparency +# but are NOT part of the benchmark identity (not hashed into run_id). +# +# Merge priority: CLI flags > suite-specific > global defaults > runner defaults + +# ── Global defaults (apply to all suites) ───────────────────────────────────── + +# Tensor parallel size — number of GPUs to use (default: 1) +tensor_parallel_size: 1 + +# Disable CUDAGraph/compilation. Required for pre-Ampere GPUs (V100, T4). +# Set to true if you encounter CUDA graph errors on older hardware. +enforce_eager: false + +# Maximum number of sequences in a batch (default: 512). 
+# Reduce on low-memory GPUs: 128 for 16 GB, 64 for 12 GB or less. +max_num_seqs: 512 + +# Fraction of GPU memory vLLM may use for weights, activations, and KV cache (default: 0.90). +# Reduce if you get OOM errors: try 0.80 for tighter memory budgets. +gpu_memory_utilization: 0.90 + +# Pass-through kwargs forwarded directly to vLLM LLM() / AsyncEngineArgs(). +# Use for any vLLM setting not listed above. See vLLM docs for valid keys: +# https://docs.vllm.ai/en/latest/api/vllm/engine/arg_utils.html +# +# 0.20-specific knobs you may want to set (uncomment as needed): +# engine_kwargs: +# # FlashAttention 4 is the 0.20 default for MLA prefill; uncomment to pin +# # for reproducibility or to force back to FA3 / Triton fallback. +# # attention_backend: FLASH_ATTN_4 +# +# # Model Runner V2 + new CUDA-graph paths: +# # compilation_config: +# # cudagraph_mode: full_and_piecewise +# +# # TurboQuant 2-bit KV cache (suite_C, --precision turboquant): +# # kv_cache_dtype: turboquant +# +# swap_space: 8 +# max_seq_len_to_capture: 4096 + +# ── Suite-specific overrides ─────────────────────────────────────────────────── +# Keys here override the global defaults above for a specific suite only. +# Only the section matching the current suite is used — other suite sections +# are never loaded or recorded. + +suites: + suite_C: + # Quantization suite (FP8/W8A8/W8A16 via compressed-tensors). + # enforce_eager disables CUDA graphs — required for W8A8/W8A16 accuracy on vLLM 0.20. + # Note: FP8 still fails on Ampere (A100, sm < 8.9): vLLM 0.20 uses a broken Marlin + # weight-only FP8 fallback. Use H100+ for Suite C FP8, or the vLLM 0.7.3 runner on A100. + enforce_eager: true + + suite_D: + # Long-context suite — reduce batch size and leave more GPU memory headroom. 
+ max_num_seqs: 64 + gpu_memory_utilization: 0.85 + + suite_F: + # Consumer/edge GPU — enforce_eager often needed for pre-Ampere chips + # enforce_eager: true + max_num_seqs: 128 + +# ── Speculative decoding (suite_A extra scenario) ───────────────────────────── +# To run the speculative scenario, uncomment the suite_A block below and merge it under the existing `suites:` key above (a second top-level `suites:` key would conflict with it). +# The draft model runs on the same GPU as the target model. +# Speculative decoding is configured via vLLM engine_kwargs. +# +# suites: +# suite_A: +# engine_kwargs: +# speculative_model: "meta-llama/Llama-3.2-1B-Instruct" +# num_speculative_tokens: 4 +# speculative_draft_tensor_parallel_size: 1 diff --git a/results/verified/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_vllm020_0f6c56e4_8f83bfab/accuracy/accuracy.json b/results/verified/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_vllm020_0f6c56e4_8f83bfab/accuracy/accuracy.json new file mode 100644 index 00000000..d837d1a7 --- /dev/null +++ b/results/verified/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_vllm020_0f6c56e4_8f83bfab/accuracy/accuracy.json @@ -0,0 +1,8 @@ +{ + "subset_score": 0.61, + "baseline_delta": 0.01, + "valid": true, + "framework": "vLLM", + "precision": "BF16", + "notes": "Integrated accuracy check \u2014 used same vLLM instance as benchmark." 
+} \ No newline at end of file diff --git a/results/verified/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_vllm020_0f6c56e4_8f83bfab/burst/result.json b/results/verified/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_vllm020_0f6c56e4_8f83bfab/burst/result.json new file mode 100644 index 00000000..133a65b6 --- /dev/null +++ b/results/verified/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_vllm020_0f6c56e4_8f83bfab/burst/result.json @@ -0,0 +1,160 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_A", + "implementation_id": "nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-18T04:31:01.283634+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tPXB\tSYS\tSYS\t0-63,128-191\t0\t\tN/A\nNIC0\tPXB\t X \tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC2\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + 
"physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" + }, + "software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.8.1", + "driver_version": "580.65.06", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": null, + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "burst", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "burst": { + "sla_ttft_ms": 500, + "burst_steady_qps": 5, + "burst_peak_qps": 25, + "burst_duration_seconds": 30, + "burst_interval_seconds": 120, + "steady_requests_total": 1812, + "burst_requests_total": 2245, + "steady_ttft_p50_ms": 39.39, + "steady_ttft_p99_ms": 79.1, + "burst_ttft_p50_ms": 7082.87, + "burst_ttft_p99_ms": 17212.99, + "sla_met_during_burst": false, + "burst_degradation_ratio": 217.605, + "results_by_cycle": [ + { + "cycle": 1, + "steady_requests": 581, + 
"burst_requests": 760, + "steady_ttft_p99_ms": 89.81, + "burst_ttft_p99_ms": 17855.37 + }, + { + "cycle": 2, + "steady_requests": 595, + "burst_requests": 734, + "steady_ttft_p99_ms": 47.72, + "burst_ttft_p99_ms": 16592.12 + }, + { + "cycle": 3, + "steady_requests": 636, + "burst_requests": 751, + "steady_ttft_p99_ms": 48.05, + "burst_ttft_p99_ms": 16579.9 + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "05:55:58", + "run_id": "8f83bfab", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_A_nvidia_vllm020_0f6c56e4_8f83bfab", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-18T05:46:54.960197+00:00", + "benchmark_end_time": "2026-05-18T05:55:58.450157+00:00", + "benchmark_elapsed_minutes": 9.1, + "model_load_seconds": 39.5 + } +} diff --git a/results/verified/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_vllm020_0f6c56e4_8f83bfab/env_info.json b/results/verified/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_vllm020_0f6c56e4_8f83bfab/env_info.json new file mode 100644 index 00000000..ccee9205 --- /dev/null +++ b/results/verified/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_vllm020_0f6c56e4_8f83bfab/env_info.json @@ -0,0 +1,49 @@ +{ + "collected_at": "2026-05-18T04:31:01.283634+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X 
\tPXB\tSYS\tSYS\t0-63,128-191\t0\t\tN/A\nNIC0\tPXB\t X \tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC2\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" +} \ No newline at end of file diff --git a/results/verified/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_vllm020_0f6c56e4_8f83bfab/interactive/result.json b/results/verified/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_vllm020_0f6c56e4_8f83bfab/interactive/result.json new file mode 100644 index 00000000..5232cac0 --- /dev/null +++ b/results/verified/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_vllm020_0f6c56e4_8f83bfab/interactive/result.json @@ -0,0 +1,132 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_A", + "implementation_id": 
"nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-18T04:31:01.283634+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tPXB\tSYS\tSYS\t0-63,128-191\t0\t\tN/A\nNIC0\tPXB\t X \tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC2\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": 
"3.12.0", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" + }, + "software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.8.1", + "driver_version": "580.65.06", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": null, + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "interactive", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "interactive": { + "ttft_ms_p50": 27.78, + "ttft_ms_p90": 42.95, + "ttft_ms_p99": 59.38, + "tpot_ms_p50": 10.77, + "tpot_ms_p90": 10.84, + "tpot_ms_p99": 10.86, + "peak_memory_gb": null, + "elapsed_seconds_median": 570.0 + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." 
+ }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "05:14:04", + "run_id": "8f83bfab", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_A_nvidia_vllm020_0f6c56e4_8f83bfab", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-18T04:45:25.062974+00:00", + "benchmark_end_time": "2026-05-18T05:14:04.982045+00:00", + "benchmark_elapsed_minutes": 28.7, + "model_load_seconds": 40.4 + } +} diff --git a/results/verified/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_vllm020_0f6c56e4_8f83bfab/offline/result.json b/results/verified/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_vllm020_0f6c56e4_8f83bfab/offline/result.json new file mode 100644 index 00000000..89283ef2 --- /dev/null +++ b/results/verified/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_vllm020_0f6c56e4_8f83bfab/offline/result.json @@ -0,0 +1,165 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_A", + "implementation_id": "nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-18T04:31:01.283634+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tPXB\tSYS\tSYS\t0-63,128-191\t0\t\tN/A\nNIC0\tPXB\t X \tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC2\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP 
interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" + }, + "software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.8.1", + "driver_version": "580.65.06", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "offline", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + 
"pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 8, + "throughput_tokens_per_sec": 3871.19, + "throughput_tokens_per_sec_per_chip": 3871.19, + "throughput_tokens_per_sec_total": 6746.91, + "elapsed_seconds_median": 8.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 32, + "throughput_tokens_per_sec": 3916.69, + "throughput_tokens_per_sec_per_chip": 3916.69, + "throughput_tokens_per_sec_total": 6785.8, + "elapsed_seconds_median": 8.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 128, + "throughput_tokens_per_sec": 3908.22, + "throughput_tokens_per_sec_per_chip": 3908.22, + "throughput_tokens_per_sec_total": 6779.65, + "elapsed_seconds_median": 8.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." 
+ }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "04:35:38", + "run_id": "8f83bfab", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_A_nvidia_vllm020_0f6c56e4_8f83bfab", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-18T04:33:49.881300+00:00", + "benchmark_end_time": "2026-05-18T04:35:38.648555+00:00", + "benchmark_elapsed_minutes": 1.8, + "model_load_seconds": 58.5 + } +} diff --git a/results/verified/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_vllm020_0f6c56e4_8f83bfab/online/result.json b/results/verified/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_vllm020_0f6c56e4_8f83bfab/online/result.json new file mode 100644 index 00000000..8fe97d2f --- /dev/null +++ b/results/verified/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_vllm020_0f6c56e4_8f83bfab/online/result.json @@ -0,0 +1,164 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_A", + "implementation_id": "nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-18T04:31:01.283634+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tPXB\tSYS\tSYS\t0-63,128-191\t0\t\tN/A\nNIC0\tPXB\t X \tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC2\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP 
interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" + }, + "software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.8.1", + "driver_version": "580.65.06", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": null, + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "online", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 
1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 5, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 40.2, + "ttft_ms_p90": 60.2, + "ttft_ms_p99": 92.29, + "tpot_ms_p50": 13.21, + "tpot_ms_p90": 14.2, + "tpot_ms_p99": 14.71, + "elapsed_seconds_median": 69.1, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 75.24, + "ttft_ms_p90": 4335.91, + "ttft_ms_p99": 5305.98, + "tpot_ms_p50": 22.28, + "tpot_ms_p90": 24.58, + "tpot_ms_p99": 26.25, + "elapsed_seconds_median": 25.0, + "sla_met": false + }, + { + "target_qps": 100, + "achieved_qps": 100.0, + "ttft_ms_p50": 1710.17, + "ttft_ms_p90": 10195.6, + "ttft_ms_p99": 10706.9, + "tpot_ms_p50": 22.18, + "tpot_ms_p90": 24.52, + "tpot_ms_p99": 28.04, + "elapsed_seconds_median": 22.3, + "sla_met": false + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." 
+ }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "04:42:53", + "run_id": "8f83bfab", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_A_nvidia_vllm020_0f6c56e4_8f83bfab", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-18T04:37:07.775120+00:00", + "benchmark_end_time": "2026-05-18T04:42:53.648821+00:00", + "benchmark_elapsed_minutes": 5.8, + "model_load_seconds": 60.9 + } +} diff --git a/results/verified/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_vllm020_0f6c56e4_8f83bfab/result.json b/results/verified/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_vllm020_0f6c56e4_8f83bfab/result.json new file mode 100644 index 00000000..ca26d93b --- /dev/null +++ b/results/verified/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_vllm020_0f6c56e4_8f83bfab/result.json @@ -0,0 +1,572 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_A", + "implementation_id": "nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-18T04:31:01.283634+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tPXB\tSYS\tSYS\t0-63,128-191\t0\t\tN/A\nNIC0\tPXB\t X \tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC2\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA 
nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" + }, + "software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.8.1", + "driver_version": "580.65.06", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenarios_run": [ + "offline", + "online", + "interactive", + "sustained", + "burst" + ], + "parallelism": { + "tensor_parallel_size": 1, + 
"pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 3, + "extra_config": null + }, + "metrics": { + "derived": {}, + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 8, + "throughput_tokens_per_sec": 3871.19, + "throughput_tokens_per_sec_per_chip": 3871.19, + "throughput_tokens_per_sec_total": 6746.91, + "elapsed_seconds_median": 8.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 32, + "throughput_tokens_per_sec": 3916.69, + "throughput_tokens_per_sec_per_chip": 3916.69, + "throughput_tokens_per_sec_total": 6785.8, + "elapsed_seconds_median": 8.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 128, + "throughput_tokens_per_sec": 3908.22, + "throughput_tokens_per_sec_per_chip": 3908.22, + "throughput_tokens_per_sec_total": 6779.65, + "elapsed_seconds_median": 8.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." 
+ } + ] + }, + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 5, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 40.2, + "ttft_ms_p90": 60.2, + "ttft_ms_p99": 92.29, + "tpot_ms_p50": 13.21, + "tpot_ms_p90": 14.2, + "tpot_ms_p99": 14.71, + "elapsed_seconds_median": 69.1, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 75.24, + "ttft_ms_p90": 4335.91, + "ttft_ms_p99": 5305.98, + "tpot_ms_p50": 22.28, + "tpot_ms_p90": 24.58, + "tpot_ms_p99": 26.25, + "elapsed_seconds_median": 25.0, + "sla_met": false + }, + { + "target_qps": 100, + "achieved_qps": 100.0, + "ttft_ms_p50": 1710.17, + "ttft_ms_p90": 10195.6, + "ttft_ms_p99": 10706.9, + "tpot_ms_p50": 22.18, + "tpot_ms_p90": 24.52, + "tpot_ms_p99": 28.04, + "elapsed_seconds_median": 22.3, + "sla_met": false + } + ] + }, + "interactive": { + "ttft_ms_p50": 27.78, + "ttft_ms_p90": 42.95, + "ttft_ms_p99": 59.38, + "tpot_ms_p50": 10.77, + "tpot_ms_p90": 10.84, + "tpot_ms_p99": 10.86, + "peak_memory_gb": null, + "elapsed_seconds_median": 570.0 + }, + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 30, + "warmup_minutes": 2, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": true, + "throughput_tokens_per_sec": 666.3, + "tokens_out": 39988, + "tokens_in": 0, + "requests_completed": 115, + "ttft_ms_p50": 43.8, + "ttft_ms_p99": 345.6 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 707.4, + "tokens_out": 42459, + "tokens_in": 0, + "requests_completed": 126, + "ttft_ms_p50": 34.5, + "ttft_ms_p99": 40.9 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 710.2, + "tokens_out": 42613, + "tokens_in": 0, + "requests_completed": 122, + "ttft_ms_p50": 34.4, + "ttft_ms_p99": 35.9 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 728.3, + "tokens_out": 43675, + "tokens_in": 0, + "requests_completed": 126, + "ttft_ms_p50": 
34.5, + "ttft_ms_p99": 36.1 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 707.4, + "tokens_out": 42459, + "tokens_in": 0, + "requests_completed": 121, + "ttft_ms_p50": 34.3, + "ttft_ms_p99": 41.4 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 711.2, + "tokens_out": 42670, + "tokens_in": 0, + "requests_completed": 128, + "ttft_ms_p50": 34.5, + "ttft_ms_p99": 40.3 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 708.9, + "tokens_out": 42534, + "tokens_in": 0, + "requests_completed": 122, + "ttft_ms_p50": 34.6, + "ttft_ms_p99": 35.8 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 701.6, + "tokens_out": 42083, + "tokens_in": 0, + "requests_completed": 121, + "ttft_ms_p50": 34.7, + "ttft_ms_p99": 40.0 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 723.0, + "tokens_out": 43388, + "tokens_in": 0, + "requests_completed": 123, + "ttft_ms_p50": 34.4, + "ttft_ms_p99": 39.8 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 702.0, + "tokens_out": 42108, + "tokens_in": 0, + "requests_completed": 124, + "ttft_ms_p50": 34.5, + "ttft_ms_p99": 39.2 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 716.4, + "tokens_out": 42995, + "tokens_in": 0, + "requests_completed": 125, + "ttft_ms_p50": 34.7, + "ttft_ms_p99": 39.5 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 718.5, + "tokens_out": 43125, + "tokens_in": 0, + "requests_completed": 124, + "ttft_ms_p50": 34.5, + "ttft_ms_p99": 40.7 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 716.5, + "tokens_out": 42976, + "tokens_in": 0, + "requests_completed": 123, + "ttft_ms_p50": 34.5, + "ttft_ms_p99": 36.0 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 708.7, + "tokens_out": 42543, + "tokens_in": 0, + "requests_completed": 125, + 
"ttft_ms_p50": 34.6, + "ttft_ms_p99": 41.1 + }, + { + "minute": 15.0, + "is_warmup": false, + "throughput_tokens_per_sec": 711.1, + "tokens_out": 42666, + "tokens_in": 0, + "requests_completed": 125, + "ttft_ms_p50": 34.6, + "ttft_ms_p99": 36.9 + }, + { + "minute": 16.0, + "is_warmup": false, + "throughput_tokens_per_sec": 715.6, + "tokens_out": 42902, + "tokens_in": 0, + "requests_completed": 120, + "ttft_ms_p50": 34.3, + "ttft_ms_p99": 36.2 + }, + { + "minute": 17.0, + "is_warmup": false, + "throughput_tokens_per_sec": 699.2, + "tokens_out": 41971, + "tokens_in": 0, + "requests_completed": 122, + "ttft_ms_p50": 34.7, + "ttft_ms_p99": 37.1 + }, + { + "minute": 18.0, + "is_warmup": false, + "throughput_tokens_per_sec": 721.0, + "tokens_out": 43276, + "tokens_in": 0, + "requests_completed": 123, + "ttft_ms_p50": 34.5, + "ttft_ms_p99": 39.7 + }, + { + "minute": 19.0, + "is_warmup": false, + "throughput_tokens_per_sec": 689.9, + "tokens_out": 41386, + "tokens_in": 0, + "requests_completed": 124, + "ttft_ms_p50": 34.6, + "ttft_ms_p99": 36.0 + }, + { + "minute": 20.0, + "is_warmup": false, + "throughput_tokens_per_sec": 718.4, + "tokens_out": 43086, + "tokens_in": 0, + "requests_completed": 123, + "ttft_ms_p50": 34.4, + "ttft_ms_p99": 36.1 + }, + { + "minute": 21.0, + "is_warmup": false, + "throughput_tokens_per_sec": 720.0, + "tokens_out": 43224, + "tokens_in": 0, + "requests_completed": 125, + "ttft_ms_p50": 34.6, + "ttft_ms_p99": 36.0 + }, + { + "minute": 22.0, + "is_warmup": false, + "throughput_tokens_per_sec": 726.1, + "tokens_out": 43543, + "tokens_in": 0, + "requests_completed": 123, + "ttft_ms_p50": 34.5, + "ttft_ms_p99": 40.6 + }, + { + "minute": 23.0, + "is_warmup": false, + "throughput_tokens_per_sec": 713.8, + "tokens_out": 42835, + "tokens_in": 0, + "requests_completed": 128, + "ttft_ms_p50": 34.7, + "ttft_ms_p99": 36.2 + }, + { + "minute": 24.0, + "is_warmup": false, + "throughput_tokens_per_sec": 694.3, + "tokens_out": 41650, + "tokens_in": 0, + 
"requests_completed": 119, + "ttft_ms_p50": 34.5, + "ttft_ms_p99": 38.5 + }, + { + "minute": 25.0, + "is_warmup": false, + "throughput_tokens_per_sec": 709.4, + "tokens_out": 42580, + "tokens_in": 0, + "requests_completed": 124, + "ttft_ms_p50": 34.7, + "ttft_ms_p99": 40.6 + }, + { + "minute": 26.0, + "is_warmup": false, + "throughput_tokens_per_sec": 720.1, + "tokens_out": 43188, + "tokens_in": 0, + "requests_completed": 123, + "ttft_ms_p50": 34.4, + "ttft_ms_p99": 36.1 + }, + { + "minute": 27.0, + "is_warmup": false, + "throughput_tokens_per_sec": 714.8, + "tokens_out": 42892, + "tokens_in": 0, + "requests_completed": 126, + "ttft_ms_p50": 34.6, + "ttft_ms_p99": 40.1 + }, + { + "minute": 28.0, + "is_warmup": false, + "throughput_tokens_per_sec": 705.6, + "tokens_out": 42347, + "tokens_in": 0, + "requests_completed": 122, + "ttft_ms_p50": 34.4, + "ttft_ms_p99": 40.7 + }, + { + "minute": 29.0, + "is_warmup": false, + "throughput_tokens_per_sec": 725.1, + "tokens_out": 43505, + "tokens_in": 0, + "requests_completed": 125, + "ttft_ms_p50": 34.6, + "ttft_ms_p99": 40.7 + } + ], + "sustained_throughput_tokens_per_sec": 712.3, + "throttle_ratio": 0.947, + "throttle_onset_minute": null, + "ttft_p99_drift_ms": -0.2 + }, + "burst": { + "sla_ttft_ms": 500, + "burst_steady_qps": 5, + "burst_peak_qps": 25, + "burst_duration_seconds": 30, + "burst_interval_seconds": 120, + "steady_requests_total": 1812, + "burst_requests_total": 2245, + "steady_ttft_p50_ms": 39.39, + "steady_ttft_p99_ms": 79.1, + "burst_ttft_p50_ms": 7082.87, + "burst_ttft_p99_ms": 17212.99, + "sla_met_during_burst": false, + "burst_degradation_ratio": 217.605, + "results_by_cycle": [ + { + "cycle": 1, + "steady_requests": 581, + "burst_requests": 760, + "steady_ttft_p99_ms": 89.81, + "burst_ttft_p99_ms": 17855.37 + }, + { + "cycle": 2, + "steady_requests": 595, + "burst_requests": 734, + "steady_ttft_p99_ms": 47.72, + "burst_ttft_p99_ms": 16592.12 + }, + { + "cycle": 3, + "steady_requests": 636, + 
"burst_requests": 751, + "steady_ttft_p99_ms": 48.05, + "burst_ttft_p99_ms": 16579.9 + } + ] + } + }, + "accuracy": { + "subset_score": 0.61, + "baseline_delta": 0.01, + "valid": true, + "framework": "vLLM", + "precision": "BF16", + "notes": "Integrated accuracy check \u2014 used same vLLM instance as benchmark." + }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "04:35:38", + "run_id": "8f83bfab", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_A_nvidia_vllm020_0f6c56e4_8f83bfab", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": "Partial run: ['offline', 'online', 'interactive', 'sustained', 'burst'] succeeded, ['speculative'] failed.", + "benchmark_start_time": "2026-05-18T04:33:49.881300+00:00", + "benchmark_end_time": "2026-05-18T04:35:38.648555+00:00", + "benchmark_elapsed_minutes": 75.5, + "model_load_seconds": 58.5, + "benchmark_elapsed_minutes_note": "Total across ['offline', 'online', 'interactive', 'sustained', 'burst'] scenarios.", + "scenario_dirs": { + "offline": "results/verified/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_vllm020_0f6c56e4_8f83bfab/offline", + "online": "results/verified/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_vllm020_0f6c56e4_8f83bfab/online", + "interactive": "results/verified/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_vllm020_0f6c56e4_8f83bfab/interactive", + "sustained": "results/verified/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_vllm020_0f6c56e4_8f83bfab/sustained", + "burst": "results/verified/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_vllm020_0f6c56e4_8f83bfab/burst" + } + } +} \ No newline at end of file diff --git a/results/verified/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_vllm020_0f6c56e4_8f83bfab/sustained/result.json b/results/verified/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_vllm020_0f6c56e4_8f83bfab/sustained/result.json new file 
mode 100644 index 00000000..1fc95fb6 --- /dev/null +++ b/results/verified/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_vllm020_0f6c56e4_8f83bfab/sustained/result.json @@ -0,0 +1,424 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_A", + "implementation_id": "nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-18T04:31:01.283634+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tPXB\tSYS\tSYS\t0-63,128-191\t0\t\tN/A\nNIC0\tPXB\t X \tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC2\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": 
"InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" + }, + "software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.8.1", + "driver_version": "580.65.06", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": null, + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "sustained", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 30, + "warmup_minutes": 2, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": true, + "throughput_tokens_per_sec": 666.3, + "tokens_out": 39988, + "tokens_in": 0, + "requests_completed": 115, + "ttft_ms_p50": 43.8, + "ttft_ms_p99": 345.6 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 707.4, + "tokens_out": 42459, + "tokens_in": 0, + "requests_completed": 126, + "ttft_ms_p50": 34.5, + "ttft_ms_p99": 40.9 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 710.2, + "tokens_out": 42613, + "tokens_in": 0, + "requests_completed": 122, + "ttft_ms_p50": 
34.4, + "ttft_ms_p99": 35.9 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 728.3, + "tokens_out": 43675, + "tokens_in": 0, + "requests_completed": 126, + "ttft_ms_p50": 34.5, + "ttft_ms_p99": 36.1 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 707.4, + "tokens_out": 42459, + "tokens_in": 0, + "requests_completed": 121, + "ttft_ms_p50": 34.3, + "ttft_ms_p99": 41.4 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 711.2, + "tokens_out": 42670, + "tokens_in": 0, + "requests_completed": 128, + "ttft_ms_p50": 34.5, + "ttft_ms_p99": 40.3 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 708.9, + "tokens_out": 42534, + "tokens_in": 0, + "requests_completed": 122, + "ttft_ms_p50": 34.6, + "ttft_ms_p99": 35.8 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 701.6, + "tokens_out": 42083, + "tokens_in": 0, + "requests_completed": 121, + "ttft_ms_p50": 34.7, + "ttft_ms_p99": 40.0 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 723.0, + "tokens_out": 43388, + "tokens_in": 0, + "requests_completed": 123, + "ttft_ms_p50": 34.4, + "ttft_ms_p99": 39.8 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 702.0, + "tokens_out": 42108, + "tokens_in": 0, + "requests_completed": 124, + "ttft_ms_p50": 34.5, + "ttft_ms_p99": 39.2 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 716.4, + "tokens_out": 42995, + "tokens_in": 0, + "requests_completed": 125, + "ttft_ms_p50": 34.7, + "ttft_ms_p99": 39.5 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 718.5, + "tokens_out": 43125, + "tokens_in": 0, + "requests_completed": 124, + "ttft_ms_p50": 34.5, + "ttft_ms_p99": 40.7 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 716.5, + "tokens_out": 42976, + "tokens_in": 0, + "requests_completed": 123, + 
"ttft_ms_p50": 34.5, + "ttft_ms_p99": 36.0 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 708.7, + "tokens_out": 42543, + "tokens_in": 0, + "requests_completed": 125, + "ttft_ms_p50": 34.6, + "ttft_ms_p99": 41.1 + }, + { + "minute": 15.0, + "is_warmup": false, + "throughput_tokens_per_sec": 711.1, + "tokens_out": 42666, + "tokens_in": 0, + "requests_completed": 125, + "ttft_ms_p50": 34.6, + "ttft_ms_p99": 36.9 + }, + { + "minute": 16.0, + "is_warmup": false, + "throughput_tokens_per_sec": 715.6, + "tokens_out": 42902, + "tokens_in": 0, + "requests_completed": 120, + "ttft_ms_p50": 34.3, + "ttft_ms_p99": 36.2 + }, + { + "minute": 17.0, + "is_warmup": false, + "throughput_tokens_per_sec": 699.2, + "tokens_out": 41971, + "tokens_in": 0, + "requests_completed": 122, + "ttft_ms_p50": 34.7, + "ttft_ms_p99": 37.1 + }, + { + "minute": 18.0, + "is_warmup": false, + "throughput_tokens_per_sec": 721.0, + "tokens_out": 43276, + "tokens_in": 0, + "requests_completed": 123, + "ttft_ms_p50": 34.5, + "ttft_ms_p99": 39.7 + }, + { + "minute": 19.0, + "is_warmup": false, + "throughput_tokens_per_sec": 689.9, + "tokens_out": 41386, + "tokens_in": 0, + "requests_completed": 124, + "ttft_ms_p50": 34.6, + "ttft_ms_p99": 36.0 + }, + { + "minute": 20.0, + "is_warmup": false, + "throughput_tokens_per_sec": 718.4, + "tokens_out": 43086, + "tokens_in": 0, + "requests_completed": 123, + "ttft_ms_p50": 34.4, + "ttft_ms_p99": 36.1 + }, + { + "minute": 21.0, + "is_warmup": false, + "throughput_tokens_per_sec": 720.0, + "tokens_out": 43224, + "tokens_in": 0, + "requests_completed": 125, + "ttft_ms_p50": 34.6, + "ttft_ms_p99": 36.0 + }, + { + "minute": 22.0, + "is_warmup": false, + "throughput_tokens_per_sec": 726.1, + "tokens_out": 43543, + "tokens_in": 0, + "requests_completed": 123, + "ttft_ms_p50": 34.5, + "ttft_ms_p99": 40.6 + }, + { + "minute": 23.0, + "is_warmup": false, + "throughput_tokens_per_sec": 713.8, + "tokens_out": 42835, + "tokens_in": 0, + 
"requests_completed": 128, + "ttft_ms_p50": 34.7, + "ttft_ms_p99": 36.2 + }, + { + "minute": 24.0, + "is_warmup": false, + "throughput_tokens_per_sec": 694.3, + "tokens_out": 41650, + "tokens_in": 0, + "requests_completed": 119, + "ttft_ms_p50": 34.5, + "ttft_ms_p99": 38.5 + }, + { + "minute": 25.0, + "is_warmup": false, + "throughput_tokens_per_sec": 709.4, + "tokens_out": 42580, + "tokens_in": 0, + "requests_completed": 124, + "ttft_ms_p50": 34.7, + "ttft_ms_p99": 40.6 + }, + { + "minute": 26.0, + "is_warmup": false, + "throughput_tokens_per_sec": 720.1, + "tokens_out": 43188, + "tokens_in": 0, + "requests_completed": 123, + "ttft_ms_p50": 34.4, + "ttft_ms_p99": 36.1 + }, + { + "minute": 27.0, + "is_warmup": false, + "throughput_tokens_per_sec": 714.8, + "tokens_out": 42892, + "tokens_in": 0, + "requests_completed": 126, + "ttft_ms_p50": 34.6, + "ttft_ms_p99": 40.1 + }, + { + "minute": 28.0, + "is_warmup": false, + "throughput_tokens_per_sec": 705.6, + "tokens_out": 42347, + "tokens_in": 0, + "requests_completed": 122, + "ttft_ms_p50": 34.4, + "ttft_ms_p99": 40.7 + }, + { + "minute": 29.0, + "is_warmup": false, + "throughput_tokens_per_sec": 725.1, + "tokens_out": 43505, + "tokens_in": 0, + "requests_completed": 125, + "ttft_ms_p50": 34.6, + "ttft_ms_p99": 40.7 + } + ], + "sustained_throughput_tokens_per_sec": 712.3, + "throttle_ratio": 0.947, + "throttle_onset_minute": null, + "ttft_p99_drift_ms": -0.2 + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." 
+ }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "05:45:22", + "run_id": "8f83bfab", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_A_nvidia_vllm020_0f6c56e4_8f83bfab", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-18T05:15:13.541588+00:00", + "benchmark_end_time": "2026-05-18T05:45:22.333860+00:00", + "benchmark_elapsed_minutes": 30.1, + "model_load_seconds": 41.7 + } +} diff --git a/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/bf16/accuracy/accuracy.json b/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/bf16/accuracy/accuracy.json new file mode 100644 index 00000000..95fced50 --- /dev/null +++ b/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/bf16/accuracy/accuracy.json @@ -0,0 +1,8 @@ +{ + "subset_score": 0.56, + "baseline_delta": 0.0, + "valid": true, + "framework": "vLLM", + "precision": "BF16", + "notes": "Integrated accuracy check \u2014 used same vLLM instance as benchmark." 
+} \ No newline at end of file diff --git a/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/bf16/offline/result.json b/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/bf16/offline/result.json new file mode 100644 index 00000000..b2759764 --- /dev/null +++ b/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/bf16/offline/result.json @@ -0,0 +1,178 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-18T05:56:25.789998+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tPXB\tSYS\tSYS\t0-63,128-191\t0\t\tN/A\nNIC0\tPXB\t X \tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC2\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 
64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" + }, + "software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.8.1", + "driver_version": "580.65.06", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "offline", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 3888.91, + "throughput_tokens_per_sec_per_chip": 3888.91, + "throughput_tokens_per_sec_total": 6956.79, + "elapsed_seconds_median": 9.2, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent 
simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 3886.06, + "throughput_tokens_per_sec_per_chip": 3886.06, + "throughput_tokens_per_sec_total": 6943.56, + "elapsed_seconds_median": 9.2, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 3885.21, + "throughput_tokens_per_sec_per_chip": 3885.21, + "throughput_tokens_per_sec_total": 6935.62, + "elapsed_seconds_median": 9.2, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 3887.73, + "throughput_tokens_per_sec_per_chip": 3887.73, + "throughput_tokens_per_sec_total": 6949.35, + "elapsed_seconds_median": 9.2, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." 
+ }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "06:01:31", + "run_id": "ffd81462", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-18T05:59:03.115858+00:00", + "benchmark_end_time": "2026-05-18T06:01:31.820089+00:00", + "benchmark_elapsed_minutes": 2.5, + "model_load_seconds": 35.2 + } +} diff --git a/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/bf16/online/result.json b/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/bf16/online/result.json new file mode 100644 index 00000000..fcd1a857 --- /dev/null +++ b/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/bf16/online/result.json @@ -0,0 +1,176 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-18T05:56:25.789998+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tPXB\tSYS\tSYS\t0-63,128-191\t0\t\tN/A\nNIC0\tPXB\t X \tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC2\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as 
well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" + }, + "software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.8.1", + "driver_version": "580.65.06", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": null, + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "online", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + 
"pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 10, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 41.15, + "ttft_ms_p90": 61.6, + "ttft_ms_p99": 96.42, + "tpot_ms_p50": 13.32, + "tpot_ms_p90": 14.41, + "tpot_ms_p99": 14.81, + "elapsed_seconds_median": 68.9, + "sla_met": true + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 52.06, + "ttft_ms_p90": 63.35, + "ttft_ms_p99": 69.96, + "tpot_ms_p50": 17.62, + "tpot_ms_p90": 18.95, + "tpot_ms_p99": 19.52, + "elapsed_seconds_median": 36.5, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 86.32, + "ttft_ms_p90": 5056.12, + "ttft_ms_p99": 5979.32, + "tpot_ms_p50": 22.47, + "tpot_ms_p90": 25.06, + "tpot_ms_p99": 26.76, + "elapsed_seconds_median": 26.1, + "sla_met": false + }, + { + "target_qps": 50, + "achieved_qps": 50.0, + "ttft_ms_p50": 891.06, + "ttft_ms_p90": 8527.64, + "ttft_ms_p99": 9213.57, + "tpot_ms_p50": 22.37, + "tpot_ms_p90": 24.86, + "tpot_ms_p99": 30.84, + "elapsed_seconds_median": 23.9, + "sla_met": false + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." 
+ }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "06:10:41", + "run_id": "ffd81462", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-18T06:02:58.126932+00:00", + "benchmark_end_time": "2026-05-18T06:10:41.342285+00:00", + "benchmark_elapsed_minutes": 7.7, + "model_load_seconds": 60.4 + } +} diff --git a/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/bf16/result.json b/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/bf16/result.json new file mode 100644 index 00000000..75e68ff6 --- /dev/null +++ b/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/bf16/result.json @@ -0,0 +1,395 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-18T05:56:25.789998+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tPXB\tSYS\tSYS\t0-63,128-191\t0\t\tN/A\nNIC0\tPXB\t X \tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC2\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP 
interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" + }, + "software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.8.1", + "driver_version": "580.65.06", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenarios_run": [ + "offline", + "online", + "sustained" + ], + "parallelism": { + "tensor_parallel_size": 1, + 
"pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 3, + "extra_config": null + }, + "metrics": { + "derived": {}, + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 3888.91, + "throughput_tokens_per_sec_per_chip": 3888.91, + "throughput_tokens_per_sec_total": 6956.79, + "elapsed_seconds_median": 9.2, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 3886.06, + "throughput_tokens_per_sec_per_chip": 3886.06, + "throughput_tokens_per_sec_total": 6943.56, + "elapsed_seconds_median": 9.2, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 3885.21, + "throughput_tokens_per_sec_per_chip": 3885.21, + "throughput_tokens_per_sec_total": 6935.62, + "elapsed_seconds_median": 9.2, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." 
+ }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 3887.73, + "throughput_tokens_per_sec_per_chip": 3887.73, + "throughput_tokens_per_sec_total": 6949.35, + "elapsed_seconds_median": 9.2, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + }, + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 10, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 41.15, + "ttft_ms_p90": 61.6, + "ttft_ms_p99": 96.42, + "tpot_ms_p50": 13.32, + "tpot_ms_p90": 14.41, + "tpot_ms_p99": 14.81, + "elapsed_seconds_median": 68.9, + "sla_met": true + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 52.06, + "ttft_ms_p90": 63.35, + "ttft_ms_p99": 69.96, + "tpot_ms_p50": 17.62, + "tpot_ms_p90": 18.95, + "tpot_ms_p99": 19.52, + "elapsed_seconds_median": 36.5, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 86.32, + "ttft_ms_p90": 5056.12, + "ttft_ms_p99": 5979.32, + "tpot_ms_p50": 22.47, + "tpot_ms_p90": 25.06, + "tpot_ms_p99": 26.76, + "elapsed_seconds_median": 26.1, + "sla_met": false + }, + { + "target_qps": 50, + "achieved_qps": 50.0, + "ttft_ms_p50": 891.06, + "ttft_ms_p90": 8527.64, + "ttft_ms_p99": 9213.57, + "tpot_ms_p50": 22.37, + "tpot_ms_p90": 24.86, + "tpot_ms_p99": 30.84, + "elapsed_seconds_median": 23.9, + "sla_met": false + } + ] + }, + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 15, + "warmup_minutes": 1, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 655.0, + "tokens_out": 39334, + "tokens_in": 0, + "requests_completed": 109, + "ttft_ms_p50": 44.4, + "ttft_ms_p99": 401.8 + }, + { + 
"minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 711.0, + "tokens_out": 42652, + "tokens_in": 0, + "requests_completed": 124, + "ttft_ms_p50": 34.5, + "ttft_ms_p99": 42.5 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 698.3, + "tokens_out": 41902, + "tokens_in": 0, + "requests_completed": 116, + "ttft_ms_p50": 34.6, + "ttft_ms_p99": 41.3 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 718.8, + "tokens_out": 43114, + "tokens_in": 0, + "requests_completed": 116, + "ttft_ms_p50": 34.6, + "ttft_ms_p99": 36.5 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 724.4, + "tokens_out": 43451, + "tokens_in": 0, + "requests_completed": 124, + "ttft_ms_p50": 34.5, + "ttft_ms_p99": 41.7 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 701.8, + "tokens_out": 42133, + "tokens_in": 0, + "requests_completed": 115, + "ttft_ms_p50": 34.5, + "ttft_ms_p99": 36.1 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 706.9, + "tokens_out": 42401, + "tokens_in": 0, + "requests_completed": 122, + "ttft_ms_p50": 34.5, + "ttft_ms_p99": 42.9 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 720.6, + "tokens_out": 43232, + "tokens_in": 0, + "requests_completed": 120, + "ttft_ms_p50": 34.5, + "ttft_ms_p99": 41.5 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 697.1, + "tokens_out": 41830, + "tokens_in": 0, + "requests_completed": 116, + "ttft_ms_p50": 34.6, + "ttft_ms_p99": 43.9 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 726.2, + "tokens_out": 43597, + "tokens_in": 0, + "requests_completed": 123, + "ttft_ms_p50": 34.4, + "ttft_ms_p99": 35.9 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 701.9, + "tokens_out": 42083, + "tokens_in": 0, + "requests_completed": 116, + "ttft_ms_p50": 34.5, + "ttft_ms_p99": 
36.1 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 728.3, + "tokens_out": 43715, + "tokens_in": 0, + "requests_completed": 121, + "ttft_ms_p50": 34.5, + "ttft_ms_p99": 43.1 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 688.9, + "tokens_out": 41331, + "tokens_in": 0, + "requests_completed": 119, + "ttft_ms_p50": 34.6, + "ttft_ms_p99": 41.5 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 717.5, + "tokens_out": 43059, + "tokens_in": 0, + "requests_completed": 119, + "ttft_ms_p50": 34.6, + "ttft_ms_p99": 41.7 + } + ], + "sustained_throughput_tokens_per_sec": 706.9, + "throttle_ratio": 0.899, + "throttle_onset_minute": 1.0, + "ttft_p99_drift_ms": -360.1 + } + }, + "accuracy": { + "subset_score": 0.56, + "baseline_delta": 0.0, + "valid": true, + "framework": "vLLM", + "precision": "BF16", + "notes": "Integrated accuracy check \u2014 used same vLLM instance as benchmark." + }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "06:01:31", + "run_id": "ffd81462", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-18T05:59:03.115858+00:00", + "benchmark_end_time": "2026-05-18T06:01:31.820089+00:00", + "benchmark_elapsed_minutes": 25.3, + "model_load_seconds": 35.2, + "benchmark_elapsed_minutes_note": "Total across ['offline', 'online', 'sustained'] scenarios.", + "scenario_dirs": { + "offline": "results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/bf16/offline", + "online": "results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/bf16/online", + "sustained": 
"results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/bf16/sustained" + } + } +} diff --git a/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/bf16/sustained/result.json b/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/bf16/sustained/result.json new file mode 100644 index 00000000..ee4ebabf --- /dev/null +++ b/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/bf16/sustained/result.json @@ -0,0 +1,274 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-18T05:56:25.789998+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tPXB\tSYS\tSYS\t0-63,128-191\t0\t\tN/A\nNIC0\tPXB\t X \tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC2\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: 
mlx5_2\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" + }, + "software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.8.1", + "driver_version": "580.65.06", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": null, + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "sustained", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 15, + "warmup_minutes": 1, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 655.0, + "tokens_out": 39334, + "tokens_in": 0, + "requests_completed": 109, + "ttft_ms_p50": 44.4, + "ttft_ms_p99": 401.8 + }, + { + "minute": 2.0, + 
"is_warmup": false, + "throughput_tokens_per_sec": 711.0, + "tokens_out": 42652, + "tokens_in": 0, + "requests_completed": 124, + "ttft_ms_p50": 34.5, + "ttft_ms_p99": 42.5 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 698.3, + "tokens_out": 41902, + "tokens_in": 0, + "requests_completed": 116, + "ttft_ms_p50": 34.6, + "ttft_ms_p99": 41.3 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 718.8, + "tokens_out": 43114, + "tokens_in": 0, + "requests_completed": 116, + "ttft_ms_p50": 34.6, + "ttft_ms_p99": 36.5 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 724.4, + "tokens_out": 43451, + "tokens_in": 0, + "requests_completed": 124, + "ttft_ms_p50": 34.5, + "ttft_ms_p99": 41.7 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 701.8, + "tokens_out": 42133, + "tokens_in": 0, + "requests_completed": 115, + "ttft_ms_p50": 34.5, + "ttft_ms_p99": 36.1 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 706.9, + "tokens_out": 42401, + "tokens_in": 0, + "requests_completed": 122, + "ttft_ms_p50": 34.5, + "ttft_ms_p99": 42.9 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 720.6, + "tokens_out": 43232, + "tokens_in": 0, + "requests_completed": 120, + "ttft_ms_p50": 34.5, + "ttft_ms_p99": 41.5 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 697.1, + "tokens_out": 41830, + "tokens_in": 0, + "requests_completed": 116, + "ttft_ms_p50": 34.6, + "ttft_ms_p99": 43.9 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 726.2, + "tokens_out": 43597, + "tokens_in": 0, + "requests_completed": 123, + "ttft_ms_p50": 34.4, + "ttft_ms_p99": 35.9 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 701.9, + "tokens_out": 42083, + "tokens_in": 0, + "requests_completed": 116, + "ttft_ms_p50": 34.5, + "ttft_ms_p99": 36.1 + }, + { + 
"minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 728.3, + "tokens_out": 43715, + "tokens_in": 0, + "requests_completed": 121, + "ttft_ms_p50": 34.5, + "ttft_ms_p99": 43.1 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 688.9, + "tokens_out": 41331, + "tokens_in": 0, + "requests_completed": 119, + "ttft_ms_p50": 34.6, + "ttft_ms_p99": 41.5 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 717.5, + "tokens_out": 43059, + "tokens_in": 0, + "requests_completed": 119, + "ttft_ms_p50": 34.6, + "ttft_ms_p99": 41.7 + } + ], + "sustained_throughput_tokens_per_sec": 706.9, + "throttle_ratio": 0.899, + "throttle_onset_minute": 1.0, + "ttft_p99_drift_ms": -360.1 + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "06:26:58", + "run_id": "ffd81462", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-18T06:11:50.049074+00:00", + "benchmark_end_time": "2026-05-18T06:26:58.575027+00:00", + "benchmark_elapsed_minutes": 15.1, + "model_load_seconds": 41.2 + } +} diff --git a/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/env_info.json b/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/env_info.json new file mode 100644 index 00000000..a73d4175 --- /dev/null +++ b/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/env_info.json @@ -0,0 +1,49 @@ +{ + "collected_at": "2026-05-18T05:56:25.789998+00:00", + "accelerators": [ 
+ { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tPXB\tSYS\tSYS\t0-63,128-191\t0\t\tN/A\nNIC0\tPXB\t X \tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC2\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" +} \ No newline at end of file diff --git a/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/result.json 
b/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/result.json new file mode 100644 index 00000000..32d0a7b7 --- /dev/null +++ b/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/result.json @@ -0,0 +1,1519 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.8.1", + "driver_version": "580.65.06", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original", + "_note": "suite model_id. Each precision level uses its own quantized checkpoint." 
+ }, + "task": { + "scenarios_run": [ + "accuracy", + "offline", + "online", + "sustained" + ], + "precision_levels_run": [ + "BF16", + "FP8", + "W8A8", + "W8A16", + "W4A16" + ], + "precision_levels_skipped": [ + "FP16" + ], + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 3, + "extra_config": null + }, + "metrics": { + "quantization": { + "results_by_precision": [ + { + "precision": "BF16", + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "best_throughput_tokens_per_sec": 3888.91, + "accuracy_score": 0.56, + "accuracy_baseline_delta": 0.0, + "accuracy_valid": true, + "quality_efficiency": 2177.8, + "speedup_vs_bf16": 1.0, + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 3888.91, + "throughput_tokens_per_sec_per_chip": 3888.91, + "throughput_tokens_per_sec_total": 6956.79, + "elapsed_seconds_median": 9.2, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 3886.06, + "throughput_tokens_per_sec_per_chip": 3886.06, + "throughput_tokens_per_sec_total": 6943.56, + "elapsed_seconds_median": 9.2, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." 
+ }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 3885.21, + "throughput_tokens_per_sec_per_chip": 3885.21, + "throughput_tokens_per_sec_total": 6935.62, + "elapsed_seconds_median": 9.2, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 3887.73, + "throughput_tokens_per_sec_per_chip": 3887.73, + "throughput_tokens_per_sec_total": 6949.35, + "elapsed_seconds_median": 9.2, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ], + "result_dir": "bf16", + "effective_dtype": "bfloat16", + "quantization_method": null + }, + { + "precision": "FP8", + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8", + "best_throughput_tokens_per_sec": 4141.71, + "accuracy_score": 0.0, + "accuracy_baseline_delta": -0.58, + "accuracy_valid": false, + "quality_efficiency": null, + "speedup_vs_bf16": 1.065, + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 4141.71, + "throughput_tokens_per_sec_per_chip": 4141.71, + "throughput_tokens_per_sec_total": 6418.35, + "elapsed_seconds_median": 12.4, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. 
The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 4130.72, + "throughput_tokens_per_sec_per_chip": 4130.72, + "throughput_tokens_per_sec_total": 6401.32, + "elapsed_seconds_median": 12.4, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 4124.42, + "throughput_tokens_per_sec_per_chip": 4124.42, + "throughput_tokens_per_sec_total": 6391.57, + "elapsed_seconds_median": 12.4, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 4131.44, + "throughput_tokens_per_sec_per_chip": 4131.44, + "throughput_tokens_per_sec_total": 6402.45, + "elapsed_seconds_median": 12.4, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." 
+ } + ], + "result_dir": "fp8", + "effective_dtype": "bfloat16", + "quantization_method": "compressed-tensors" + }, + { + "precision": "W8A8", + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "best_throughput_tokens_per_sec": 3208.11, + "accuracy_score": 0.59, + "accuracy_baseline_delta": 0.0, + "accuracy_valid": true, + "quality_efficiency": 1892.8, + "speedup_vs_bf16": 0.825, + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 3208.11, + "throughput_tokens_per_sec_per_chip": 3208.11, + "throughput_tokens_per_sec_total": 5840.36, + "elapsed_seconds_median": 10.7, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 3140.16, + "throughput_tokens_per_sec_per_chip": 3140.16, + "throughput_tokens_per_sec_total": 5706.63, + "elapsed_seconds_median": 11.0, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 3193.23, + "throughput_tokens_per_sec_per_chip": 3193.23, + "throughput_tokens_per_sec_total": 5813.28, + "elapsed_seconds_median": 10.7, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. 
The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 3175.58, + "throughput_tokens_per_sec_per_chip": 3175.58, + "throughput_tokens_per_sec_total": 5786.77, + "elapsed_seconds_median": 10.8, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ], + "result_dir": "w8a8", + "effective_dtype": "bfloat16", + "quantization_method": "compressed-tensors" + }, + { + "precision": "W8A16", + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a16", + "best_throughput_tokens_per_sec": 3547.44, + "accuracy_score": 0.58, + "accuracy_baseline_delta": -0.01, + "accuracy_valid": true, + "quality_efficiency": 2057.5, + "speedup_vs_bf16": 0.912, + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 3533.68, + "throughput_tokens_per_sec_per_chip": 3533.68, + "throughput_tokens_per_sec_total": 6328.84, + "elapsed_seconds_median": 10.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." 
+ }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 3510.7, + "throughput_tokens_per_sec_per_chip": 3510.7, + "throughput_tokens_per_sec_total": 6292.5, + "elapsed_seconds_median": 10.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 3535.13, + "throughput_tokens_per_sec_per_chip": 3535.13, + "throughput_tokens_per_sec_total": 6324.07, + "elapsed_seconds_median": 10.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 3547.44, + "throughput_tokens_per_sec_per_chip": 3547.44, + "throughput_tokens_per_sec_total": 6336.33, + "elapsed_seconds_median": 10.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." 
+ } + ], + "result_dir": "w8a16", + "effective_dtype": "bfloat16", + "quantization_method": "compressed-tensors" + }, + { + "precision": "W4A16", + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16", + "best_throughput_tokens_per_sec": 1889.19, + "accuracy_score": 0.56, + "accuracy_baseline_delta": -0.01, + "accuracy_valid": true, + "quality_efficiency": 1057.9, + "speedup_vs_bf16": 0.486, + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 1889.19, + "throughput_tokens_per_sec_per_chip": 1889.19, + "throughput_tokens_per_sec_total": 3433.47, + "elapsed_seconds_median": 18.2, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 1862.45, + "throughput_tokens_per_sec_per_chip": 1862.45, + "throughput_tokens_per_sec_total": 3376.95, + "elapsed_seconds_median": 18.6, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 1861.34, + "throughput_tokens_per_sec_per_chip": 1861.34, + "throughput_tokens_per_sec_total": 3375.2, + "elapsed_seconds_median": 18.6, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. 
The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 1851.04, + "throughput_tokens_per_sec_per_chip": 1851.04, + "throughput_tokens_per_sec_total": 3367.3, + "elapsed_seconds_median": 18.6, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ], + "result_dir": "w4a16", + "effective_dtype": "float16", + "quantization_method": "gptq" + } + ] + }, + "derived": {}, + "quantization_online": { + "results_by_precision": [ + { + "precision": "BF16", + "max_valid_qps": 10, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 41.15, + "ttft_ms_p90": 61.6, + "ttft_ms_p99": 96.42, + "tpot_ms_p50": 13.32, + "tpot_ms_p90": 14.41, + "tpot_ms_p99": 14.81, + "elapsed_seconds_median": 68.9, + "sla_met": true + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 52.06, + "ttft_ms_p90": 63.35, + "ttft_ms_p99": 69.96, + "tpot_ms_p50": 17.62, + "tpot_ms_p90": 18.95, + "tpot_ms_p99": 19.52, + "elapsed_seconds_median": 36.5, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 86.32, + "ttft_ms_p90": 5056.12, + "ttft_ms_p99": 5979.32, + "tpot_ms_p50": 22.47, + "tpot_ms_p90": 25.06, + "tpot_ms_p99": 26.76, + "elapsed_seconds_median": 26.1, + "sla_met": false + }, + { + "target_qps": 50, + "achieved_qps": 50.0, + "ttft_ms_p50": 891.06, + "ttft_ms_p90": 8527.64, + "ttft_ms_p99": 9213.57, + "tpot_ms_p50": 22.37, + "tpot_ms_p90": 24.86, + "tpot_ms_p99": 30.84, + "elapsed_seconds_median": 23.9, + "sla_met": false + } + ] + }, + { + "precision": "FP8", + "max_valid_qps": 5, + "results_by_qps": [ + { + "target_qps": 
5, + "achieved_qps": 5.0, + "ttft_ms_p50": 53.12, + "ttft_ms_p90": 73.69, + "ttft_ms_p99": 115.17, + "tpot_ms_p50": 18.68, + "tpot_ms_p90": 20.12, + "tpot_ms_p99": 20.68, + "elapsed_seconds_median": 72.2, + "sla_met": true + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 84.99, + "ttft_ms_p90": 1892.71, + "ttft_ms_p99": 3191.04, + "tpot_ms_p50": 26.19, + "tpot_ms_p90": 27.78, + "tpot_ms_p99": 28.06, + "elapsed_seconds_median": 43.0, + "sla_met": false + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 6847.07, + "ttft_ms_p90": 15210.93, + "ttft_ms_p99": 16362.21, + "tpot_ms_p50": 25.97, + "tpot_ms_p90": 26.93, + "tpot_ms_p99": 27.06, + "elapsed_seconds_median": 38.8, + "sla_met": false + }, + { + "target_qps": 50, + "achieved_qps": 50.0, + "ttft_ms_p50": 10056.63, + "ttft_ms_p90": 20353.69, + "ttft_ms_p99": 21149.06, + "tpot_ms_p50": 25.43, + "tpot_ms_p90": 26.15, + "tpot_ms_p99": 26.2, + "elapsed_seconds_median": 37.3, + "sla_met": false + } + ] + }, + { + "precision": "W8A8", + "max_valid_qps": 10, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 55.34, + "ttft_ms_p90": 63.29, + "ttft_ms_p99": 69.75, + "tpot_ms_p50": 20.67, + "tpot_ms_p90": 20.88, + "tpot_ms_p99": 21.3, + "elapsed_seconds_median": 72.9, + "sla_met": true + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 57.42, + "ttft_ms_p90": 66.6, + "ttft_ms_p99": 69.92, + "tpot_ms_p50": 21.28, + "tpot_ms_p90": 22.19, + "tpot_ms_p99": 22.28, + "elapsed_seconds_median": 39.6, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 74.55, + "ttft_ms_p90": 4438.81, + "ttft_ms_p99": 5421.82, + "tpot_ms_p50": 22.53, + "tpot_ms_p90": 23.69, + "tpot_ms_p99": 25.14, + "elapsed_seconds_median": 27.7, + "sla_met": false + }, + { + "target_qps": 50, + "achieved_qps": 50.0, + "ttft_ms_p50": 985.11, + "ttft_ms_p90": 8331.22, + "ttft_ms_p99": 8868.55, + "tpot_ms_p50": 23.38, + "tpot_ms_p90": 24.38, 
+ "tpot_ms_p99": 26.79, + "elapsed_seconds_median": 25.6, + "sla_met": false + } + ] + }, + { + "precision": "W8A16", + "max_valid_qps": 10, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 46.32, + "ttft_ms_p90": 62.37, + "ttft_ms_p99": 104.49, + "tpot_ms_p50": 16.66, + "tpot_ms_p90": 17.7, + "tpot_ms_p99": 18.28, + "elapsed_seconds_median": 70.3, + "sla_met": true + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 57.25, + "ttft_ms_p90": 72.45, + "ttft_ms_p99": 81.09, + "tpot_ms_p50": 20.79, + "tpot_ms_p90": 22.4, + "tpot_ms_p99": 23.2, + "elapsed_seconds_median": 37.9, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 93.4, + "ttft_ms_p90": 6429.1, + "ttft_ms_p99": 7429.34, + "tpot_ms_p50": 25.13, + "tpot_ms_p90": 28.01, + "tpot_ms_p99": 30.79, + "elapsed_seconds_median": 28.8, + "sla_met": false + }, + { + "target_qps": 50, + "achieved_qps": 50.0, + "ttft_ms_p50": 1306.21, + "ttft_ms_p90": 9937.77, + "ttft_ms_p99": 10640.43, + "tpot_ms_p50": 25.06, + "tpot_ms_p90": 27.42, + "tpot_ms_p99": 34.24, + "elapsed_seconds_median": 26.4, + "sla_met": false + } + ] + }, + { + "precision": "W4A16", + "max_valid_qps": 10, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 50.42, + "ttft_ms_p90": 64.62, + "ttft_ms_p99": 104.73, + "tpot_ms_p50": 18.15, + "tpot_ms_p90": 19.16, + "tpot_ms_p99": 19.62, + "elapsed_seconds_median": 71.1, + "sla_met": true + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 57.89, + "ttft_ms_p90": 72.82, + "ttft_ms_p99": 84.96, + "tpot_ms_p50": 21.11, + "tpot_ms_p90": 23.03, + "tpot_ms_p99": 24.31, + "elapsed_seconds_median": 38.5, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 97.24, + "ttft_ms_p90": 6365.08, + "ttft_ms_p99": 7073.94, + "tpot_ms_p50": 25.46, + "tpot_ms_p90": 27.98, + "tpot_ms_p99": 31.21, + "elapsed_seconds_median": 29.2, + "sla_met": false + }, + 
{ + "target_qps": 50, + "achieved_qps": 50.0, + "ttft_ms_p50": 918.12, + "ttft_ms_p90": 9805.4, + "ttft_ms_p99": 10437.5, + "tpot_ms_p50": 25.2, + "tpot_ms_p90": 27.67, + "tpot_ms_p99": 32.45, + "elapsed_seconds_median": 26.7, + "sla_met": false + } + ] + } + ] + }, + "quantization_sustained": { + "results_by_precision": [ + { + "precision": "BF16", + "sustained_throughput_tokens_per_sec": 706.9, + "throttle_ratio": 0.899, + "throttle_onset_minute": 1.0, + "ttft_p99_drift_ms": -360.1, + "sustained_concurrency": 8, + "duration_minutes": 15, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 655.0, + "tokens_out": 39334, + "tokens_in": 0, + "requests_completed": 109, + "ttft_ms_p50": 44.4, + "ttft_ms_p99": 401.8 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 711.0, + "tokens_out": 42652, + "tokens_in": 0, + "requests_completed": 124, + "ttft_ms_p50": 34.5, + "ttft_ms_p99": 42.5 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 698.3, + "tokens_out": 41902, + "tokens_in": 0, + "requests_completed": 116, + "ttft_ms_p50": 34.6, + "ttft_ms_p99": 41.3 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 718.8, + "tokens_out": 43114, + "tokens_in": 0, + "requests_completed": 116, + "ttft_ms_p50": 34.6, + "ttft_ms_p99": 36.5 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 724.4, + "tokens_out": 43451, + "tokens_in": 0, + "requests_completed": 124, + "ttft_ms_p50": 34.5, + "ttft_ms_p99": 41.7 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 701.8, + "tokens_out": 42133, + "tokens_in": 0, + "requests_completed": 115, + "ttft_ms_p50": 34.5, + "ttft_ms_p99": 36.1 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 706.9, + "tokens_out": 42401, + "tokens_in": 0, + "requests_completed": 122, + "ttft_ms_p50": 34.5, + "ttft_ms_p99": 42.9 + }, + { + "minute": 8.0, + 
"is_warmup": false, + "throughput_tokens_per_sec": 720.6, + "tokens_out": 43232, + "tokens_in": 0, + "requests_completed": 120, + "ttft_ms_p50": 34.5, + "ttft_ms_p99": 41.5 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 697.1, + "tokens_out": 41830, + "tokens_in": 0, + "requests_completed": 116, + "ttft_ms_p50": 34.6, + "ttft_ms_p99": 43.9 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 726.2, + "tokens_out": 43597, + "tokens_in": 0, + "requests_completed": 123, + "ttft_ms_p50": 34.4, + "ttft_ms_p99": 35.9 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 701.9, + "tokens_out": 42083, + "tokens_in": 0, + "requests_completed": 116, + "ttft_ms_p50": 34.5, + "ttft_ms_p99": 36.1 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 728.3, + "tokens_out": 43715, + "tokens_in": 0, + "requests_completed": 121, + "ttft_ms_p50": 34.5, + "ttft_ms_p99": 43.1 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 688.9, + "tokens_out": 41331, + "tokens_in": 0, + "requests_completed": 119, + "ttft_ms_p50": 34.6, + "ttft_ms_p99": 41.5 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 717.5, + "tokens_out": 43059, + "tokens_in": 0, + "requests_completed": 119, + "ttft_ms_p50": 34.6, + "ttft_ms_p99": 41.7 + } + ] + }, + { + "precision": "FP8", + "sustained_throughput_tokens_per_sec": 438.9, + "throttle_ratio": 0.856, + "throttle_onset_minute": 1.0, + "ttft_p99_drift_ms": -644.7, + "sustained_concurrency": 8, + "duration_minutes": 15, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 409.6, + "tokens_out": 24576, + "tokens_in": 0, + "requests_completed": 48, + "ttft_ms_p50": 178.1, + "ttft_ms_p99": 701.6 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 409.6, + "tokens_out": 24576, + "tokens_in": 0, + "requests_completed": 48, + 
"ttft_ms_p50": 186.0, + "ttft_ms_p99": 236.5 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 477.8, + "tokens_out": 28672, + "tokens_in": 0, + "requests_completed": 56, + "ttft_ms_p50": 51.0, + "ttft_ms_p99": 125.3 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 409.5, + "tokens_out": 24576, + "tokens_in": 0, + "requests_completed": 48, + "ttft_ms_p50": 50.3, + "ttft_ms_p99": 55.6 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 478.0, + "tokens_out": 28672, + "tokens_in": 0, + "requests_completed": 56, + "ttft_ms_p50": 50.6, + "ttft_ms_p99": 54.6 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 409.3, + "tokens_out": 24576, + "tokens_in": 0, + "requests_completed": 48, + "ttft_ms_p50": 51.2, + "ttft_ms_p99": 57.3 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 478.2, + "tokens_out": 28672, + "tokens_in": 0, + "requests_completed": 56, + "ttft_ms_p50": 50.5, + "ttft_ms_p99": 56.4 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 409.4, + "tokens_out": 24576, + "tokens_in": 0, + "requests_completed": 48, + "ttft_ms_p50": 51.3, + "ttft_ms_p99": 62.9 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 478.0, + "tokens_out": 28672, + "tokens_in": 0, + "requests_completed": 56, + "ttft_ms_p50": 51.1, + "ttft_ms_p99": 59.5 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 409.4, + "tokens_out": 24576, + "tokens_in": 0, + "requests_completed": 48, + "ttft_ms_p50": 50.5, + "ttft_ms_p99": 55.7 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 478.2, + "tokens_out": 28672, + "tokens_in": 0, + "requests_completed": 56, + "ttft_ms_p50": 51.4, + "ttft_ms_p99": 58.1 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 409.6, + "tokens_out": 24576, + "tokens_in": 0, + "requests_completed": 
48, + "ttft_ms_p50": 51.6, + "ttft_ms_p99": 59.1 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 477.6, + "tokens_out": 28672, + "tokens_in": 0, + "requests_completed": 56, + "ttft_ms_p50": 51.4, + "ttft_ms_p99": 58.2 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 409.8, + "tokens_out": 24576, + "tokens_in": 0, + "requests_completed": 48, + "ttft_ms_p50": 51.3, + "ttft_ms_p99": 56.9 + } + ] + }, + { + "precision": "W8A8", + "sustained_throughput_tokens_per_sec": 399.4, + "throttle_ratio": 0.879, + "throttle_onset_minute": 1.0, + "ttft_p99_drift_ms": -331.5, + "sustained_concurrency": 8, + "duration_minutes": 15, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 366.9, + "tokens_out": 22031, + "tokens_in": 0, + "requests_completed": 63, + "ttft_ms_p50": 59.1, + "ttft_ms_p99": 396.4 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 402.7, + "tokens_out": 24148, + "tokens_in": 0, + "requests_completed": 71, + "ttft_ms_p50": 58.8, + "ttft_ms_p99": 63.4 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 400.8, + "tokens_out": 24050, + "tokens_in": 0, + "requests_completed": 66, + "ttft_ms_p50": 58.5, + "ttft_ms_p99": 61.0 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 402.1, + "tokens_out": 24127, + "tokens_in": 0, + "requests_completed": 66, + "ttft_ms_p50": 58.0, + "ttft_ms_p99": 61.2 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 395.3, + "tokens_out": 23722, + "tokens_in": 0, + "requests_completed": 71, + "ttft_ms_p50": 58.7, + "ttft_ms_p99": 64.5 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 398.4, + "tokens_out": 23893, + "tokens_in": 0, + "requests_completed": 64, + "ttft_ms_p50": 58.7, + "ttft_ms_p99": 63.0 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 410.0, + "tokens_out": 
24605, + "tokens_in": 0, + "requests_completed": 66, + "ttft_ms_p50": 58.3, + "ttft_ms_p99": 62.1 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 398.6, + "tokens_out": 23918, + "tokens_in": 0, + "requests_completed": 71, + "ttft_ms_p50": 59.5, + "ttft_ms_p99": 65.7 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 390.3, + "tokens_out": 23418, + "tokens_in": 0, + "requests_completed": 64, + "ttft_ms_p50": 58.3, + "ttft_ms_p99": 62.8 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 417.2, + "tokens_out": 25045, + "tokens_in": 0, + "requests_completed": 69, + "ttft_ms_p50": 58.2, + "ttft_ms_p99": 61.0 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 391.1, + "tokens_out": 23462, + "tokens_in": 0, + "requests_completed": 70, + "ttft_ms_p50": 57.8, + "ttft_ms_p99": 62.4 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 408.7, + "tokens_out": 24514, + "tokens_in": 0, + "requests_completed": 66, + "ttft_ms_p50": 58.5, + "ttft_ms_p99": 65.6 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 415.2, + "tokens_out": 24925, + "tokens_in": 0, + "requests_completed": 71, + "ttft_ms_p50": 58.2, + "ttft_ms_p99": 61.3 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 395.0, + "tokens_out": 23687, + "tokens_in": 0, + "requests_completed": 67, + "ttft_ms_p50": 58.0, + "ttft_ms_p99": 64.9 + } + ] + }, + { + "precision": "W8A16", + "sustained_throughput_tokens_per_sec": 494.1, + "throttle_ratio": 0.905, + "throttle_onset_minute": null, + "ttft_p99_drift_ms": -320.5, + "sustained_concurrency": 8, + "duration_minutes": 15, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 456.8, + "tokens_out": 27416, + "tokens_in": 0, + "requests_completed": 81, + "ttft_ms_p50": 50.0, + "ttft_ms_p99": 372.0 + }, + { + "minute": 2.0, + "is_warmup": false, + 
"throughput_tokens_per_sec": 504.0, + "tokens_out": 30242, + "tokens_in": 0, + "requests_completed": 82, + "ttft_ms_p50": 47.1, + "ttft_ms_p99": 71.8 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 486.6, + "tokens_out": 29207, + "tokens_in": 0, + "requests_completed": 83, + "ttft_ms_p50": 47.1, + "ttft_ms_p99": 54.4 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 499.0, + "tokens_out": 29921, + "tokens_in": 0, + "requests_completed": 85, + "ttft_ms_p50": 46.8, + "ttft_ms_p99": 52.6 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 496.0, + "tokens_out": 29768, + "tokens_in": 0, + "requests_completed": 79, + "ttft_ms_p50": 46.9, + "ttft_ms_p99": 49.5 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 498.3, + "tokens_out": 29901, + "tokens_in": 0, + "requests_completed": 84, + "ttft_ms_p50": 47.0, + "ttft_ms_p99": 52.3 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 495.4, + "tokens_out": 29715, + "tokens_in": 0, + "requests_completed": 85, + "ttft_ms_p50": 46.7, + "ttft_ms_p99": 50.2 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 496.1, + "tokens_out": 29779, + "tokens_in": 0, + "requests_completed": 81, + "ttft_ms_p50": 46.9, + "ttft_ms_p99": 53.9 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 503.2, + "tokens_out": 30195, + "tokens_in": 0, + "requests_completed": 85, + "ttft_ms_p50": 47.3, + "ttft_ms_p99": 54.0 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 489.6, + "tokens_out": 29369, + "tokens_in": 0, + "requests_completed": 83, + "ttft_ms_p50": 46.9, + "ttft_ms_p99": 52.3 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 504.8, + "tokens_out": 30299, + "tokens_in": 0, + "requests_completed": 81, + "ttft_ms_p50": 46.8, + "ttft_ms_p99": 52.2 + }, + { + "minute": 12.0, + "is_warmup": 
false, + "throughput_tokens_per_sec": 500.4, + "tokens_out": 30017, + "tokens_in": 0, + "requests_completed": 85, + "ttft_ms_p50": 46.7, + "ttft_ms_p99": 50.1 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 494.6, + "tokens_out": 29670, + "tokens_in": 0, + "requests_completed": 84, + "ttft_ms_p50": 46.8, + "ttft_ms_p99": 51.2 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 492.1, + "tokens_out": 29528, + "tokens_in": 0, + "requests_completed": 81, + "ttft_ms_p50": 47.0, + "ttft_ms_p99": 51.5 + } + ] + }, + { + "precision": "W4A16", + "sustained_throughput_tokens_per_sec": 437.3, + "throttle_ratio": 0.897, + "throttle_onset_minute": 1.0, + "ttft_p99_drift_ms": -632.2, + "sustained_concurrency": 8, + "duration_minutes": 15, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 409.4, + "tokens_out": 24574, + "tokens_in": 0, + "requests_completed": 73, + "ttft_ms_p50": 55.4, + "ttft_ms_p99": 690.1 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 431.5, + "tokens_out": 25899, + "tokens_in": 0, + "requests_completed": 75, + "ttft_ms_p50": 53.4, + "ttft_ms_p99": 74.1 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 456.2, + "tokens_out": 27364, + "tokens_in": 0, + "requests_completed": 77, + "ttft_ms_p50": 53.1, + "ttft_ms_p99": 56.8 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 439.7, + "tokens_out": 26374, + "tokens_in": 0, + "requests_completed": 75, + "ttft_ms_p50": 53.2, + "ttft_ms_p99": 58.1 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 430.2, + "tokens_out": 25819, + "tokens_in": 0, + "requests_completed": 75, + "ttft_ms_p50": 53.5, + "ttft_ms_p99": 56.1 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 432.7, + "tokens_out": 25962, + "tokens_in": 0, + "requests_completed": 74, + "ttft_ms_p50": 53.9, + 
"ttft_ms_p99": 57.4 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 452.2, + "tokens_out": 27140, + "tokens_in": 0, + "requests_completed": 77, + "ttft_ms_p50": 53.4, + "ttft_ms_p99": 57.2 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 436.1, + "tokens_out": 26169, + "tokens_in": 0, + "requests_completed": 74, + "ttft_ms_p50": 53.6, + "ttft_ms_p99": 57.7 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 432.3, + "tokens_out": 25934, + "tokens_in": 0, + "requests_completed": 76, + "ttft_ms_p50": 53.2, + "ttft_ms_p99": 57.8 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 431.9, + "tokens_out": 25908, + "tokens_in": 0, + "requests_completed": 73, + "ttft_ms_p50": 53.4, + "ttft_ms_p99": 56.8 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 449.0, + "tokens_out": 26936, + "tokens_in": 0, + "requests_completed": 77, + "ttft_ms_p50": 53.4, + "ttft_ms_p99": 57.9 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 445.3, + "tokens_out": 26739, + "tokens_in": 0, + "requests_completed": 75, + "ttft_ms_p50": 53.1, + "ttft_ms_p99": 59.4 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 441.9, + "tokens_out": 26490, + "tokens_in": 0, + "requests_completed": 78, + "ttft_ms_p50": 53.3, + "ttft_ms_p99": 55.8 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 433.6, + "tokens_out": 26022, + "tokens_in": 0, + "requests_completed": 73, + "ttft_ms_p50": 53.0, + "ttft_ms_p99": 57.9 + } + ] + } + ] + } + }, + "accuracy": null, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "06:01:31", + "run_id": "ffd81462", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + 
"env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-18T05:59:03.115858+00:00", + "benchmark_end_time": "2026-05-18T06:01:31.820089+00:00", + "benchmark_elapsed_minutes": 134.3, + "model_load_seconds": 35.2, + "benchmark_elapsed_minutes_note": "Sum of per-precision benchmark_elapsed_minutes (excludes sleep gaps and orchestrator overhead).", + "scenario_dirs": { + "bf16/offline": "results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/bf16/offline", + "bf16/online": "results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/bf16/online", + "bf16/sustained": "results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/bf16/sustained", + "fp8/offline": "results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/fp8/offline", + "fp8/online": "results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/fp8/online", + "fp8/sustained": "results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/fp8/sustained", + "w8a8/offline": "results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w8a8/offline", + "w8a8/online": "results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w8a8/online", + "w8a8/sustained": "results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w8a8/sustained", + "w8a16/offline": "results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w8a16/offline", + "w8a16/online": "results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w8a16/online", + "w8a16/sustained": "results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w8a16/sustained", + "w4a16/offline": "results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w4a16/offline", + "w4a16/online": 
"results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w4a16/online", + "w4a16/sustained": "results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w4a16/sustained" + }, + "precision_dirs": { + "BF16": "bf16", + "FP8": "fp8", + "W8A8": "w8a8", + "W8A16": "w8a16", + "W4A16": "w4a16" + }, + "precision_model_map": { + "BF16": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "dtype_override": "bfloat16" + }, + "FP8": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8", + "model_revision": "12fd6884d2585dd4d020373e7f39f74507b31866", + "engine_kwargs": { + "quantization": "compressed-tensors" + }, + "_note": "Static per-tensor FP8 (weights + activations). Requires Ampere+ (A100, A800, H20). Skipped automatically on FP16-only hardware." + }, + "W8A8": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "model_revision": "e2bfb7d92784ad7d1b606c2f9644d3cefb2ec708", + "engine_kwargs": { + "quantization": "compressed-tensors" + }, + "_note": "INT8 weights + INT8 activations via compressed-tensors. Exercises native int8 tensor cores." + }, + "W8A16": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a16", + "model_revision": "38e03ba250017bf8ed3eeecd3a744e21f6b994a9", + "engine_kwargs": { + "quantization": "compressed-tensors" + }, + "_note": "INT8 weights, FP16 activations. Weight-only quantization \u2014 reduces memory bandwidth, not compute dtype." + }, + "W4A16": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16", + "model_revision": "70371b1b0ea0d4eacfe1ee9056ee805629921c6e", + "engine_kwargs": { + "quantization": "gptq" + }, + "_note": "INT4 weights, FP16 activations via GPTQ Marlin kernels. Weight-only quantization \u2014 larger memory saving than W8A16." 
+ } + } + } +} \ No newline at end of file diff --git a/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w4a16/accuracy/accuracy.json b/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w4a16/accuracy/accuracy.json new file mode 100644 index 00000000..b3311bd9 --- /dev/null +++ b/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w4a16/accuracy/accuracy.json @@ -0,0 +1,8 @@ +{ + "subset_score": 0.56, + "baseline_delta": -0.01, + "valid": true, + "framework": "vLLM", + "precision": "W4A16", + "notes": "Integrated accuracy check \u2014 used same vLLM instance as benchmark." +} \ No newline at end of file diff --git a/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w4a16/offline/result.json b/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w4a16/offline/result.json new file mode 100644 index 00000000..cc0424b3 --- /dev/null +++ b/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w4a16/offline/result.json @@ -0,0 +1,183 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-18T05:56:25.789998+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tPXB\tSYS\tSYS\t0-63,128-191\t0\t\tN/A\nNIC0\tPXB\t X \tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC2\tSYS\tSYS\tPIX\t X 
\t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" + }, + "software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.8.1", + "driver_version": "580.65.06", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16", + "model_revision": "70371b1b0ea0d4eacfe1ee9056ee805629921c6e", + "model_name": null, + "model_note": "INT4 weight-only quantization by RedHatAI using AWQ. 
Weights INT4, activations FP16.", + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "W4A16", + "effective_dtype": "float16", + "quantization_method": "gptq", + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "offline", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": { + "tensor_parallel_size": 1, + "enforce_eager": true, + "max_num_seqs": 512, + "gpu_memory_utilization": 0.9 + }, + "runtime_metrics": null + }, + "metrics": { + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 1889.19, + "throughput_tokens_per_sec_per_chip": 1889.19, + "throughput_tokens_per_sec_total": 3433.47, + "elapsed_seconds_median": 18.2, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 1862.45, + "throughput_tokens_per_sec_per_chip": 1862.45, + "throughput_tokens_per_sec_total": 3376.95, + "elapsed_seconds_median": 18.6, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." 
+ }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 1861.34, + "throughput_tokens_per_sec_per_chip": 1861.34, + "throughput_tokens_per_sec_total": 3375.2, + "elapsed_seconds_median": 18.6, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 1851.04, + "throughput_tokens_per_sec_per_chip": 1851.04, + "throughput_tokens_per_sec_total": 3367.3, + "elapsed_seconds_median": 18.6, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." 
+ }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "14:44:34", + "run_id": "b1eb2d96", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_b1eb2d96", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-18T14:39:39.920688+00:00", + "benchmark_end_time": "2026-05-18T14:44:34.781477+00:00", + "benchmark_elapsed_minutes": 4.9, + "model_load_seconds": 18.5 + } +} \ No newline at end of file diff --git a/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w4a16/online/result.json b/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w4a16/online/result.json new file mode 100644 index 00000000..72619342 --- /dev/null +++ b/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w4a16/online/result.json @@ -0,0 +1,181 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-18T05:56:25.789998+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tPXB\tSYS\tSYS\t0-63,128-191\t0\t\tN/A\nNIC0\tPXB\t X \tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC2\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = 
Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" + }, + "software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.8.1", + "driver_version": "580.65.06", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16", + "model_revision": "70371b1b0ea0d4eacfe1ee9056ee805629921c6e", + "model_name": null, + "model_note": "INT4 weight-only quantization by RedHatAI using AWQ. 
Weights INT4, activations FP16.", + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "W4A16", + "effective_dtype": null, + "quantization_method": "gptq", + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "online", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": { + "tensor_parallel_size": 1, + "enforce_eager": true, + "max_num_seqs": 512, + "gpu_memory_utilization": 0.9 + }, + "runtime_metrics": null + }, + "metrics": { + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 10, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 50.42, + "ttft_ms_p90": 64.62, + "ttft_ms_p99": 104.73, + "tpot_ms_p50": 18.15, + "tpot_ms_p90": 19.16, + "tpot_ms_p99": 19.62, + "elapsed_seconds_median": 71.1, + "sla_met": true + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 57.89, + "ttft_ms_p90": 72.82, + "ttft_ms_p99": 84.96, + "tpot_ms_p50": 21.11, + "tpot_ms_p90": 23.03, + "tpot_ms_p99": 24.31, + "elapsed_seconds_median": 38.5, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 97.24, + "ttft_ms_p90": 6365.08, + "ttft_ms_p99": 7073.94, + "tpot_ms_p50": 25.46, + "tpot_ms_p90": 27.98, + "tpot_ms_p99": 31.21, + "elapsed_seconds_median": 29.2, + "sla_met": false + }, + { + "target_qps": 50, + "achieved_qps": 50.0, + "ttft_ms_p50": 918.12, + "ttft_ms_p90": 9805.4, + "ttft_ms_p99": 10437.5, + "tpot_ms_p50": 25.2, + "tpot_ms_p90": 27.67, + "tpot_ms_p99": 32.45, + "elapsed_seconds_median": 26.7, + "sla_met": false + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." 
+ }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "14:53:48", + "run_id": "b1eb2d96", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_b1eb2d96", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-18T14:45:34.287656+00:00", + "benchmark_end_time": "2026-05-18T14:53:48.716951+00:00", + "benchmark_elapsed_minutes": 8.2, + "model_load_seconds": 29.3 + } +} \ No newline at end of file diff --git a/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w4a16/result.json b/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w4a16/result.json new file mode 100644 index 00000000..27e0744a --- /dev/null +++ b/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w4a16/result.json @@ -0,0 +1,400 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-18T05:56:25.789998+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tPXB\tSYS\tSYS\t0-63,128-191\t0\t\tN/A\nNIC0\tPXB\t X \tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC2\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing 
PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" + }, + "software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.8.1", + "driver_version": "580.65.06", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16", + "model_revision": "70371b1b0ea0d4eacfe1ee9056ee805629921c6e", + "model_name": null, + "model_note": "INT4 weight-only quantization by RedHatAI using AWQ. 
Weights INT4, activations FP16.", + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "W4A16", + "effective_dtype": "float16", + "quantization_method": "gptq", + "model_format": "HuggingFace original" + }, + "task": { + "scenarios_run": [ + "offline", + "online", + "sustained" + ], + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 3, + "extra_config": { + "tensor_parallel_size": 1, + "enforce_eager": true, + "max_num_seqs": 512, + "gpu_memory_utilization": 0.9 + } + }, + "metrics": { + "derived": {}, + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 1889.19, + "throughput_tokens_per_sec_per_chip": 1889.19, + "throughput_tokens_per_sec_total": 3433.47, + "elapsed_seconds_median": 18.2, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 1862.45, + "throughput_tokens_per_sec_per_chip": 1862.45, + "throughput_tokens_per_sec_total": 3376.95, + "elapsed_seconds_median": 18.6, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." 
+ }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 1861.34, + "throughput_tokens_per_sec_per_chip": 1861.34, + "throughput_tokens_per_sec_total": 3375.2, + "elapsed_seconds_median": 18.6, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 1851.04, + "throughput_tokens_per_sec_per_chip": 1851.04, + "throughput_tokens_per_sec_total": 3367.3, + "elapsed_seconds_median": 18.6, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." 
+ } + ] + }, + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 10, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 50.42, + "ttft_ms_p90": 64.62, + "ttft_ms_p99": 104.73, + "tpot_ms_p50": 18.15, + "tpot_ms_p90": 19.16, + "tpot_ms_p99": 19.62, + "elapsed_seconds_median": 71.1, + "sla_met": true + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 57.89, + "ttft_ms_p90": 72.82, + "ttft_ms_p99": 84.96, + "tpot_ms_p50": 21.11, + "tpot_ms_p90": 23.03, + "tpot_ms_p99": 24.31, + "elapsed_seconds_median": 38.5, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 97.24, + "ttft_ms_p90": 6365.08, + "ttft_ms_p99": 7073.94, + "tpot_ms_p50": 25.46, + "tpot_ms_p90": 27.98, + "tpot_ms_p99": 31.21, + "elapsed_seconds_median": 29.2, + "sla_met": false + }, + { + "target_qps": 50, + "achieved_qps": 50.0, + "ttft_ms_p50": 918.12, + "ttft_ms_p90": 9805.4, + "ttft_ms_p99": 10437.5, + "tpot_ms_p50": 25.2, + "tpot_ms_p90": 27.67, + "tpot_ms_p99": 32.45, + "elapsed_seconds_median": 26.7, + "sla_met": false + } + ] + }, + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 15, + "warmup_minutes": 1, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 409.4, + "tokens_out": 24574, + "tokens_in": 0, + "requests_completed": 73, + "ttft_ms_p50": 55.4, + "ttft_ms_p99": 690.1 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 431.5, + "tokens_out": 25899, + "tokens_in": 0, + "requests_completed": 75, + "ttft_ms_p50": 53.4, + "ttft_ms_p99": 74.1 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 456.2, + "tokens_out": 27364, + "tokens_in": 0, + "requests_completed": 77, + "ttft_ms_p50": 53.1, + "ttft_ms_p99": 56.8 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 439.7, + "tokens_out": 26374, + "tokens_in": 0, + "requests_completed": 
75, + "ttft_ms_p50": 53.2, + "ttft_ms_p99": 58.1 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 430.2, + "tokens_out": 25819, + "tokens_in": 0, + "requests_completed": 75, + "ttft_ms_p50": 53.5, + "ttft_ms_p99": 56.1 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 432.7, + "tokens_out": 25962, + "tokens_in": 0, + "requests_completed": 74, + "ttft_ms_p50": 53.9, + "ttft_ms_p99": 57.4 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 452.2, + "tokens_out": 27140, + "tokens_in": 0, + "requests_completed": 77, + "ttft_ms_p50": 53.4, + "ttft_ms_p99": 57.2 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 436.1, + "tokens_out": 26169, + "tokens_in": 0, + "requests_completed": 74, + "ttft_ms_p50": 53.6, + "ttft_ms_p99": 57.7 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 432.3, + "tokens_out": 25934, + "tokens_in": 0, + "requests_completed": 76, + "ttft_ms_p50": 53.2, + "ttft_ms_p99": 57.8 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 431.9, + "tokens_out": 25908, + "tokens_in": 0, + "requests_completed": 73, + "ttft_ms_p50": 53.4, + "ttft_ms_p99": 56.8 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 449.0, + "tokens_out": 26936, + "tokens_in": 0, + "requests_completed": 77, + "ttft_ms_p50": 53.4, + "ttft_ms_p99": 57.9 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 445.3, + "tokens_out": 26739, + "tokens_in": 0, + "requests_completed": 75, + "ttft_ms_p50": 53.1, + "ttft_ms_p99": 59.4 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 441.9, + "tokens_out": 26490, + "tokens_in": 0, + "requests_completed": 78, + "ttft_ms_p50": 53.3, + "ttft_ms_p99": 55.8 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 433.6, + "tokens_out": 26022, + "tokens_in": 0, + 
"requests_completed": 73, + "ttft_ms_p50": 53.0, + "ttft_ms_p99": 57.9 + } + ], + "sustained_throughput_tokens_per_sec": 437.3, + "throttle_ratio": 0.897, + "throttle_onset_minute": 1.0, + "ttft_p99_drift_ms": -632.2 + } + }, + "accuracy": { + "subset_score": 0.56, + "baseline_delta": -0.01, + "valid": true, + "framework": "vLLM", + "precision": "W4A16", + "notes": "Integrated accuracy check \u2014 used same vLLM instance as benchmark." + }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "14:44:34", + "run_id": "b1eb2d96", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_b1eb2d96", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-18T14:39:39.920688+00:00", + "benchmark_end_time": "2026-05-18T14:44:34.781477+00:00", + "benchmark_elapsed_minutes": 28.3, + "model_load_seconds": 18.5, + "benchmark_elapsed_minutes_note": "Total across ['offline', 'online', 'sustained'] scenarios.", + "scenario_dirs": { + "offline": "results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w4a16/offline", + "online": "results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w4a16/online", + "sustained": "results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w4a16/sustained" + } + } +} \ No newline at end of file diff --git a/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w4a16/sustained/result.json b/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w4a16/sustained/result.json new file mode 100644 index 00000000..9982bb06 --- /dev/null +++ b/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w4a16/sustained/result.json @@ -0,0 +1,279 @@ 
+{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-18T05:56:25.789998+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tPXB\tSYS\tSYS\t0-63,128-191\t0\t\tN/A\nNIC0\tPXB\t X \tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC2\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + 
"bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" + }, + "software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.8.1", + "driver_version": "580.65.06", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16", + "model_revision": "70371b1b0ea0d4eacfe1ee9056ee805629921c6e", + "model_name": null, + "model_note": "INT4 weight-only quantization by RedHatAI using AWQ. Weights INT4, activations FP16.", + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "W4A16", + "effective_dtype": null, + "quantization_method": "gptq", + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "sustained", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": { + "tensor_parallel_size": 1, + "enforce_eager": true, + "max_num_seqs": 512, + "gpu_memory_utilization": 0.9 + }, + "runtime_metrics": null + }, + "metrics": { + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 15, + "warmup_minutes": 1, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 409.4, + "tokens_out": 24574, + "tokens_in": 0, + "requests_completed": 73, + "ttft_ms_p50": 55.4, + "ttft_ms_p99": 690.1 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 431.5, + "tokens_out": 25899, + "tokens_in": 0, + "requests_completed": 75, + "ttft_ms_p50": 53.4, + "ttft_ms_p99": 74.1 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 456.2, + "tokens_out": 27364, + "tokens_in": 0, + "requests_completed": 
77, + "ttft_ms_p50": 53.1, + "ttft_ms_p99": 56.8 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 439.7, + "tokens_out": 26374, + "tokens_in": 0, + "requests_completed": 75, + "ttft_ms_p50": 53.2, + "ttft_ms_p99": 58.1 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 430.2, + "tokens_out": 25819, + "tokens_in": 0, + "requests_completed": 75, + "ttft_ms_p50": 53.5, + "ttft_ms_p99": 56.1 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 432.7, + "tokens_out": 25962, + "tokens_in": 0, + "requests_completed": 74, + "ttft_ms_p50": 53.9, + "ttft_ms_p99": 57.4 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 452.2, + "tokens_out": 27140, + "tokens_in": 0, + "requests_completed": 77, + "ttft_ms_p50": 53.4, + "ttft_ms_p99": 57.2 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 436.1, + "tokens_out": 26169, + "tokens_in": 0, + "requests_completed": 74, + "ttft_ms_p50": 53.6, + "ttft_ms_p99": 57.7 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 432.3, + "tokens_out": 25934, + "tokens_in": 0, + "requests_completed": 76, + "ttft_ms_p50": 53.2, + "ttft_ms_p99": 57.8 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 431.9, + "tokens_out": 25908, + "tokens_in": 0, + "requests_completed": 73, + "ttft_ms_p50": 53.4, + "ttft_ms_p99": 56.8 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 449.0, + "tokens_out": 26936, + "tokens_in": 0, + "requests_completed": 77, + "ttft_ms_p50": 53.4, + "ttft_ms_p99": 57.9 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 445.3, + "tokens_out": 26739, + "tokens_in": 0, + "requests_completed": 75, + "ttft_ms_p50": 53.1, + "ttft_ms_p99": 59.4 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 441.9, + "tokens_out": 26490, + "tokens_in": 0, + 
"requests_completed": 78, + "ttft_ms_p50": 53.3, + "ttft_ms_p99": 55.8 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 433.6, + "tokens_out": 26022, + "tokens_in": 0, + "requests_completed": 73, + "ttft_ms_p50": 53.0, + "ttft_ms_p99": 57.9 + } + ], + "sustained_throughput_tokens_per_sec": 437.3, + "throttle_ratio": 0.897, + "throttle_onset_minute": 1.0, + "ttft_p99_drift_ms": -632.2 + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "15:09:58", + "run_id": "b1eb2d96", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_b1eb2d96", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-18T14:54:46.184556+00:00", + "benchmark_end_time": "2026-05-18T15:09:58.042851+00:00", + "benchmark_elapsed_minutes": 15.2, + "model_load_seconds": 25.5 + } +} \ No newline at end of file diff --git a/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w8a16/accuracy/accuracy.json b/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w8a16/accuracy/accuracy.json new file mode 100644 index 00000000..a6505b13 --- /dev/null +++ b/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w8a16/accuracy/accuracy.json @@ -0,0 +1,8 @@ +{ + "subset_score": 0.58, + "baseline_delta": -0.01, + "valid": true, + "framework": "vLLM", + "precision": "W8A16", + "notes": "Integrated accuracy check \u2014 used same vLLM instance as benchmark." 
+} \ No newline at end of file diff --git a/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w8a16/offline/result.json b/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w8a16/offline/result.json new file mode 100644 index 00000000..6cb0ddaa --- /dev/null +++ b/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w8a16/offline/result.json @@ -0,0 +1,183 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-18T05:56:25.789998+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tPXB\tSYS\tSYS\t0-63,128-191\t0\t\tN/A\nNIC0\tPXB\t X \tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC2\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 
7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" + }, + "software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.8.1", + "driver_version": "580.65.06", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a16", + "model_revision": "38e03ba250017bf8ed3eeecd3a744e21f6b994a9", + "model_name": null, + "model_note": "INT8 weight-only quantization by RedHatAI using llm-compressor. 
Weights INT8, activations FP16.", + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "W8A16", + "effective_dtype": "bfloat16", + "quantization_method": "compressed-tensors", + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "offline", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": { + "tensor_parallel_size": 1, + "enforce_eager": true, + "max_num_seqs": 512, + "gpu_memory_utilization": 0.9 + }, + "runtime_metrics": null + }, + "metrics": { + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 3533.68, + "throughput_tokens_per_sec_per_chip": 3533.68, + "throughput_tokens_per_sec_total": 6328.84, + "elapsed_seconds_median": 10.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 3510.7, + "throughput_tokens_per_sec_per_chip": 3510.7, + "throughput_tokens_per_sec_total": 6292.5, + "elapsed_seconds_median": 10.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." 
+ }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 3535.13, + "throughput_tokens_per_sec_per_chip": 3535.13, + "throughput_tokens_per_sec_total": 6324.07, + "elapsed_seconds_median": 10.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 3547.44, + "throughput_tokens_per_sec_per_chip": 3547.44, + "throughput_tokens_per_sec_total": 6336.33, + "elapsed_seconds_median": 10.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." 
+ }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "14:10:48", + "run_id": "5b72ecb7", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_5b72ecb7", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-18T14:08:05.948198+00:00", + "benchmark_end_time": "2026-05-18T14:10:48.869711+00:00", + "benchmark_elapsed_minutes": 2.7, + "model_load_seconds": 25.3 + } +} \ No newline at end of file diff --git a/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w8a16/online/result.json b/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w8a16/online/result.json new file mode 100644 index 00000000..8f59b189 --- /dev/null +++ b/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w8a16/online/result.json @@ -0,0 +1,181 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-18T05:56:25.789998+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tPXB\tSYS\tSYS\t0-63,128-191\t0\t\tN/A\nNIC0\tPXB\t X \tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC2\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = 
Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" + }, + "software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.8.1", + "driver_version": "580.65.06", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a16", + "model_revision": "38e03ba250017bf8ed3eeecd3a744e21f6b994a9", + "model_name": null, + "model_note": "INT8 weight-only quantization by RedHatAI using llm-compressor. 
Weights INT8, activations FP16.", + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "W8A16", + "effective_dtype": null, + "quantization_method": "compressed-tensors", + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "online", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": { + "tensor_parallel_size": 1, + "enforce_eager": true, + "max_num_seqs": 512, + "gpu_memory_utilization": 0.9 + }, + "runtime_metrics": null + }, + "metrics": { + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 10, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 46.32, + "ttft_ms_p90": 62.37, + "ttft_ms_p99": 104.49, + "tpot_ms_p50": 16.66, + "tpot_ms_p90": 17.7, + "tpot_ms_p99": 18.28, + "elapsed_seconds_median": 70.3, + "sla_met": true + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 57.25, + "ttft_ms_p90": 72.45, + "ttft_ms_p99": 81.09, + "tpot_ms_p50": 20.79, + "tpot_ms_p90": 22.4, + "tpot_ms_p99": 23.2, + "elapsed_seconds_median": 37.9, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 93.4, + "ttft_ms_p90": 6429.1, + "ttft_ms_p99": 7429.34, + "tpot_ms_p50": 25.13, + "tpot_ms_p90": 28.01, + "tpot_ms_p99": 30.79, + "elapsed_seconds_median": 28.8, + "sla_met": false + }, + { + "target_qps": 50, + "achieved_qps": 50.0, + "ttft_ms_p50": 1306.21, + "ttft_ms_p90": 9937.77, + "ttft_ms_p99": 10640.43, + "tpot_ms_p50": 25.06, + "tpot_ms_p90": 27.42, + "tpot_ms_p99": 34.24, + "elapsed_seconds_median": 26.4, + "sla_met": false + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." 
+ }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "14:20:00", + "run_id": "5b72ecb7", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_5b72ecb7", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-18T14:11:50.787859+00:00", + "benchmark_end_time": "2026-05-18T14:20:00.278335+00:00", + "benchmark_elapsed_minutes": 8.2, + "model_load_seconds": 34.3 + } +} \ No newline at end of file diff --git a/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w8a16/result.json b/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w8a16/result.json new file mode 100644 index 00000000..485a0fb3 --- /dev/null +++ b/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w8a16/result.json @@ -0,0 +1,400 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-18T05:56:25.789998+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tPXB\tSYS\tSYS\t0-63,128-191\t0\t\tN/A\nNIC0\tPXB\t X \tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC2\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing 
PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" + }, + "software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.8.1", + "driver_version": "580.65.06", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a16", + "model_revision": "38e03ba250017bf8ed3eeecd3a744e21f6b994a9", + "model_name": null, + "model_note": "INT8 weight-only quantization by RedHatAI using llm-compressor. 
Weights INT8, activations FP16.", + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "W8A16", + "effective_dtype": "bfloat16", + "quantization_method": "compressed-tensors", + "model_format": "HuggingFace original" + }, + "task": { + "scenarios_run": [ + "offline", + "online", + "sustained" + ], + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 3, + "extra_config": { + "tensor_parallel_size": 1, + "enforce_eager": true, + "max_num_seqs": 512, + "gpu_memory_utilization": 0.9 + } + }, + "metrics": { + "derived": {}, + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 3533.68, + "throughput_tokens_per_sec_per_chip": 3533.68, + "throughput_tokens_per_sec_total": 6328.84, + "elapsed_seconds_median": 10.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 3510.7, + "throughput_tokens_per_sec_per_chip": 3510.7, + "throughput_tokens_per_sec_total": 6292.5, + "elapsed_seconds_median": 10.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." 
+ }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 3535.13, + "throughput_tokens_per_sec_per_chip": 3535.13, + "throughput_tokens_per_sec_total": 6324.07, + "elapsed_seconds_median": 10.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 3547.44, + "throughput_tokens_per_sec_per_chip": 3547.44, + "throughput_tokens_per_sec_total": 6336.33, + "elapsed_seconds_median": 10.1, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." 
+ } + ] + }, + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 10, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 46.32, + "ttft_ms_p90": 62.37, + "ttft_ms_p99": 104.49, + "tpot_ms_p50": 16.66, + "tpot_ms_p90": 17.7, + "tpot_ms_p99": 18.28, + "elapsed_seconds_median": 70.3, + "sla_met": true + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 57.25, + "ttft_ms_p90": 72.45, + "ttft_ms_p99": 81.09, + "tpot_ms_p50": 20.79, + "tpot_ms_p90": 22.4, + "tpot_ms_p99": 23.2, + "elapsed_seconds_median": 37.9, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 93.4, + "ttft_ms_p90": 6429.1, + "ttft_ms_p99": 7429.34, + "tpot_ms_p50": 25.13, + "tpot_ms_p90": 28.01, + "tpot_ms_p99": 30.79, + "elapsed_seconds_median": 28.8, + "sla_met": false + }, + { + "target_qps": 50, + "achieved_qps": 50.0, + "ttft_ms_p50": 1306.21, + "ttft_ms_p90": 9937.77, + "ttft_ms_p99": 10640.43, + "tpot_ms_p50": 25.06, + "tpot_ms_p90": 27.42, + "tpot_ms_p99": 34.24, + "elapsed_seconds_median": 26.4, + "sla_met": false + } + ] + }, + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 15, + "warmup_minutes": 1, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 456.8, + "tokens_out": 27416, + "tokens_in": 0, + "requests_completed": 81, + "ttft_ms_p50": 50.0, + "ttft_ms_p99": 372.0 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 504.0, + "tokens_out": 30242, + "tokens_in": 0, + "requests_completed": 82, + "ttft_ms_p50": 47.1, + "ttft_ms_p99": 71.8 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 486.6, + "tokens_out": 29207, + "tokens_in": 0, + "requests_completed": 83, + "ttft_ms_p50": 47.1, + "ttft_ms_p99": 54.4 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 499.0, + "tokens_out": 29921, + "tokens_in": 0, + "requests_completed": 85, 
+ "ttft_ms_p50": 46.8, + "ttft_ms_p99": 52.6 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 496.0, + "tokens_out": 29768, + "tokens_in": 0, + "requests_completed": 79, + "ttft_ms_p50": 46.9, + "ttft_ms_p99": 49.5 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 498.3, + "tokens_out": 29901, + "tokens_in": 0, + "requests_completed": 84, + "ttft_ms_p50": 47.0, + "ttft_ms_p99": 52.3 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 495.4, + "tokens_out": 29715, + "tokens_in": 0, + "requests_completed": 85, + "ttft_ms_p50": 46.7, + "ttft_ms_p99": 50.2 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 496.1, + "tokens_out": 29779, + "tokens_in": 0, + "requests_completed": 81, + "ttft_ms_p50": 46.9, + "ttft_ms_p99": 53.9 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 503.2, + "tokens_out": 30195, + "tokens_in": 0, + "requests_completed": 85, + "ttft_ms_p50": 47.3, + "ttft_ms_p99": 54.0 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 489.6, + "tokens_out": 29369, + "tokens_in": 0, + "requests_completed": 83, + "ttft_ms_p50": 46.9, + "ttft_ms_p99": 52.3 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 504.8, + "tokens_out": 30299, + "tokens_in": 0, + "requests_completed": 81, + "ttft_ms_p50": 46.8, + "ttft_ms_p99": 52.2 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 500.4, + "tokens_out": 30017, + "tokens_in": 0, + "requests_completed": 85, + "ttft_ms_p50": 46.7, + "ttft_ms_p99": 50.1 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 494.6, + "tokens_out": 29670, + "tokens_in": 0, + "requests_completed": 84, + "ttft_ms_p50": 46.8, + "ttft_ms_p99": 51.2 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 492.1, + "tokens_out": 29528, + "tokens_in": 0, + 
"requests_completed": 81, + "ttft_ms_p50": 47.0, + "ttft_ms_p99": 51.5 + } + ], + "sustained_throughput_tokens_per_sec": 494.1, + "throttle_ratio": 0.905, + "throttle_onset_minute": null, + "ttft_p99_drift_ms": -320.5 + } + }, + "accuracy": { + "subset_score": 0.58, + "baseline_delta": -0.01, + "valid": true, + "framework": "vLLM", + "precision": "W8A16", + "notes": "Integrated accuracy check \u2014 used same vLLM instance as benchmark." + }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "14:10:48", + "run_id": "5b72ecb7", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_5b72ecb7", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-18T14:08:05.948198+00:00", + "benchmark_end_time": "2026-05-18T14:10:48.869711+00:00", + "benchmark_elapsed_minutes": 26.1, + "model_load_seconds": 25.3, + "benchmark_elapsed_minutes_note": "Total across ['offline', 'online', 'sustained'] scenarios.", + "scenario_dirs": { + "offline": "results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w8a16/offline", + "online": "results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w8a16/online", + "sustained": "results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w8a16/sustained" + } + } +} \ No newline at end of file diff --git a/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w8a16/sustained/result.json b/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w8a16/sustained/result.json new file mode 100644 index 00000000..0fa36d8f --- /dev/null +++ b/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w8a16/sustained/result.json @@ -0,0 +1,279 @@ 
+{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-18T05:56:25.789998+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tPXB\tSYS\tSYS\t0-63,128-191\t0\t\tN/A\nNIC0\tPXB\t X \tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC2\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + 
"bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" + }, + "software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.8.1", + "driver_version": "580.65.06", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a16", + "model_revision": "38e03ba250017bf8ed3eeecd3a744e21f6b994a9", + "model_name": null, + "model_note": "INT8 weight-only quantization by RedHatAI using llm-compressor. Weights INT8, activations FP16.", + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "W8A16", + "effective_dtype": null, + "quantization_method": "compressed-tensors", + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "sustained", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": { + "tensor_parallel_size": 1, + "enforce_eager": true, + "max_num_seqs": 512, + "gpu_memory_utilization": 0.9 + }, + "runtime_metrics": null + }, + "metrics": { + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 15, + "warmup_minutes": 1, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 456.8, + "tokens_out": 27416, + "tokens_in": 0, + "requests_completed": 81, + "ttft_ms_p50": 50.0, + "ttft_ms_p99": 372.0 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 504.0, + "tokens_out": 30242, + "tokens_in": 0, + "requests_completed": 82, + "ttft_ms_p50": 47.1, + "ttft_ms_p99": 71.8 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 486.6, + "tokens_out": 29207, + "tokens_in": 
0, + "requests_completed": 83, + "ttft_ms_p50": 47.1, + "ttft_ms_p99": 54.4 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 499.0, + "tokens_out": 29921, + "tokens_in": 0, + "requests_completed": 85, + "ttft_ms_p50": 46.8, + "ttft_ms_p99": 52.6 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 496.0, + "tokens_out": 29768, + "tokens_in": 0, + "requests_completed": 79, + "ttft_ms_p50": 46.9, + "ttft_ms_p99": 49.5 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 498.3, + "tokens_out": 29901, + "tokens_in": 0, + "requests_completed": 84, + "ttft_ms_p50": 47.0, + "ttft_ms_p99": 52.3 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 495.4, + "tokens_out": 29715, + "tokens_in": 0, + "requests_completed": 85, + "ttft_ms_p50": 46.7, + "ttft_ms_p99": 50.2 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 496.1, + "tokens_out": 29779, + "tokens_in": 0, + "requests_completed": 81, + "ttft_ms_p50": 46.9, + "ttft_ms_p99": 53.9 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 503.2, + "tokens_out": 30195, + "tokens_in": 0, + "requests_completed": 85, + "ttft_ms_p50": 47.3, + "ttft_ms_p99": 54.0 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 489.6, + "tokens_out": 29369, + "tokens_in": 0, + "requests_completed": 83, + "ttft_ms_p50": 46.9, + "ttft_ms_p99": 52.3 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 504.8, + "tokens_out": 30299, + "tokens_in": 0, + "requests_completed": 81, + "ttft_ms_p50": 46.8, + "ttft_ms_p99": 52.2 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 500.4, + "tokens_out": 30017, + "tokens_in": 0, + "requests_completed": 85, + "ttft_ms_p50": 46.7, + "ttft_ms_p99": 50.1 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 494.6, + "tokens_out": 29670, + 
"tokens_in": 0, + "requests_completed": 84, + "ttft_ms_p50": 46.8, + "ttft_ms_p99": 51.2 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 492.1, + "tokens_out": 29528, + "tokens_in": 0, + "requests_completed": 81, + "ttft_ms_p50": 47.0, + "ttft_ms_p99": 51.5 + } + ], + "sustained_throughput_tokens_per_sec": 494.1, + "throttle_ratio": 0.905, + "throttle_onset_minute": null, + "ttft_p99_drift_ms": -320.5 + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "14:36:58", + "run_id": "5b72ecb7", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_5b72ecb7", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-18T14:21:44.687162+00:00", + "benchmark_end_time": "2026-05-18T14:36:58.078243+00:00", + "benchmark_elapsed_minutes": 15.2, + "model_load_seconds": 75.8 + } +} \ No newline at end of file diff --git a/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w8a8/accuracy/accuracy.json b/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w8a8/accuracy/accuracy.json new file mode 100644 index 00000000..a4847b6d --- /dev/null +++ b/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w8a8/accuracy/accuracy.json @@ -0,0 +1,8 @@ +{ + "subset_score": 0.59, + "baseline_delta": 0.0, + "valid": true, + "framework": "vLLM", + "precision": "W8A8", + "notes": "Integrated accuracy check \u2014 used same vLLM instance as benchmark." 
+} \ No newline at end of file diff --git a/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w8a8/offline/result.json b/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w8a8/offline/result.json new file mode 100644 index 00000000..01e7a3d7 --- /dev/null +++ b/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w8a8/offline/result.json @@ -0,0 +1,183 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-18T05:56:25.789998+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tPXB\tSYS\tSYS\t0-63,128-191\t0\t\tN/A\nNIC0\tPXB\t X \tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC2\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 
64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" + }, + "software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.8.1", + "driver_version": "580.65.06", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "model_revision": "e2bfb7d92784ad7d1b606c2f9644d3cefb2ec708", + "model_name": null, + "model_note": "INT8 quantized by RedHatAI using llm-compressor (compressed-tensors). 
Both weights and activations quantized to INT8.", + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "W8A8", + "effective_dtype": "bfloat16", + "quantization_method": "compressed-tensors", + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "offline", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": { + "tensor_parallel_size": 1, + "enforce_eager": true, + "max_num_seqs": 512, + "gpu_memory_utilization": 0.9 + }, + "runtime_metrics": null + }, + "metrics": { + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 3208.11, + "throughput_tokens_per_sec_per_chip": 3208.11, + "throughput_tokens_per_sec_total": 5840.36, + "elapsed_seconds_median": 10.7, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 3140.16, + "throughput_tokens_per_sec_per_chip": 3140.16, + "throughput_tokens_per_sec_total": 5706.63, + "elapsed_seconds_median": 11.0, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." 
+ }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 3193.23, + "throughput_tokens_per_sec_per_chip": 3193.23, + "throughput_tokens_per_sec_total": 5813.28, + "elapsed_seconds_median": 10.7, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 3175.58, + "throughput_tokens_per_sec_per_chip": 3175.58, + "throughput_tokens_per_sec_total": 5786.77, + "elapsed_seconds_median": 10.8, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." 
+ }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "13:39:44", + "run_id": "1b79437b", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_1b79437b", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-18T13:36:50.535504+00:00", + "benchmark_end_time": "2026-05-18T13:39:44.822889+00:00", + "benchmark_elapsed_minutes": 2.9, + "model_load_seconds": 18.2 + } +} \ No newline at end of file diff --git a/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w8a8/online/result.json b/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w8a8/online/result.json new file mode 100644 index 00000000..5e5bd009 --- /dev/null +++ b/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w8a8/online/result.json @@ -0,0 +1,181 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-18T05:56:25.789998+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tPXB\tSYS\tSYS\t0-63,128-191\t0\t\tN/A\nNIC0\tPXB\t X \tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC2\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = 
Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" + }, + "software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.8.1", + "driver_version": "580.65.06", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "model_revision": "e2bfb7d92784ad7d1b606c2f9644d3cefb2ec708", + "model_name": null, + "model_note": "INT8 quantized by RedHatAI using llm-compressor (compressed-tensors). 
Both weights and activations quantized to INT8.", + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "W8A8", + "effective_dtype": null, + "quantization_method": "compressed-tensors", + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "online", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": { + "tensor_parallel_size": 1, + "enforce_eager": true, + "max_num_seqs": 512, + "gpu_memory_utilization": 0.9 + }, + "runtime_metrics": null + }, + "metrics": { + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 10, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 55.34, + "ttft_ms_p90": 63.29, + "ttft_ms_p99": 69.75, + "tpot_ms_p50": 20.67, + "tpot_ms_p90": 20.88, + "tpot_ms_p99": 21.3, + "elapsed_seconds_median": 72.9, + "sla_met": true + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 57.42, + "ttft_ms_p90": 66.6, + "ttft_ms_p99": 69.92, + "tpot_ms_p50": 21.28, + "tpot_ms_p90": 22.19, + "tpot_ms_p99": 22.28, + "elapsed_seconds_median": 39.6, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 74.55, + "ttft_ms_p90": 4438.81, + "ttft_ms_p99": 5421.82, + "tpot_ms_p50": 22.53, + "tpot_ms_p90": 23.69, + "tpot_ms_p99": 25.14, + "elapsed_seconds_median": 27.7, + "sla_met": false + }, + { + "target_qps": 50, + "achieved_qps": 50.0, + "ttft_ms_p50": 985.11, + "ttft_ms_p90": 8331.22, + "ttft_ms_p99": 8868.55, + "tpot_ms_p50": 23.38, + "tpot_ms_p90": 24.38, + "tpot_ms_p99": 26.79, + "elapsed_seconds_median": 25.6, + "sla_met": false + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." 
+ }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "13:48:53", + "run_id": "1b79437b", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_1b79437b", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-18T13:40:36.652448+00:00", + "benchmark_end_time": "2026-05-18T13:48:53.173908+00:00", + "benchmark_elapsed_minutes": 8.3, + "model_load_seconds": 23.6 + } +} \ No newline at end of file diff --git a/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w8a8/result.json b/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w8a8/result.json new file mode 100644 index 00000000..ff6e59c1 --- /dev/null +++ b/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w8a8/result.json @@ -0,0 +1,400 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-18T05:56:25.789998+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tPXB\tSYS\tSYS\t0-63,128-191\t0\t\tN/A\nNIC0\tPXB\t X \tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC2\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing 
PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" + }, + "software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.8.1", + "driver_version": "580.65.06", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "model_revision": "e2bfb7d92784ad7d1b606c2f9644d3cefb2ec708", + "model_name": null, + "model_note": "INT8 quantized by RedHatAI using llm-compressor (compressed-tensors). 
Both weights and activations quantized to INT8.", + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "W8A8", + "effective_dtype": "bfloat16", + "quantization_method": "compressed-tensors", + "model_format": "HuggingFace original" + }, + "task": { + "scenarios_run": [ + "offline", + "online", + "sustained" + ], + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 3, + "extra_config": { + "tensor_parallel_size": 1, + "enforce_eager": true, + "max_num_seqs": 512, + "gpu_memory_utilization": 0.9 + } + }, + "metrics": { + "derived": {}, + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 3208.11, + "throughput_tokens_per_sec_per_chip": 3208.11, + "throughput_tokens_per_sec_total": 5840.36, + "elapsed_seconds_median": 10.7, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 3140.16, + "throughput_tokens_per_sec_per_chip": 3140.16, + "throughput_tokens_per_sec_total": 5706.63, + "elapsed_seconds_median": 11.0, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." 
+ }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 3193.23, + "throughput_tokens_per_sec_per_chip": 3193.23, + "throughput_tokens_per_sec_total": 5813.28, + "elapsed_seconds_median": 10.7, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 3175.58, + "throughput_tokens_per_sec_per_chip": 3175.58, + "throughput_tokens_per_sec_total": 5786.77, + "elapsed_seconds_median": 10.8, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." 
+ } + ] + }, + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 10, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 55.34, + "ttft_ms_p90": 63.29, + "ttft_ms_p99": 69.75, + "tpot_ms_p50": 20.67, + "tpot_ms_p90": 20.88, + "tpot_ms_p99": 21.3, + "elapsed_seconds_median": 72.9, + "sla_met": true + }, + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 57.42, + "ttft_ms_p90": 66.6, + "ttft_ms_p99": 69.92, + "tpot_ms_p50": 21.28, + "tpot_ms_p90": 22.19, + "tpot_ms_p99": 22.28, + "elapsed_seconds_median": 39.6, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 74.55, + "ttft_ms_p90": 4438.81, + "ttft_ms_p99": 5421.82, + "tpot_ms_p50": 22.53, + "tpot_ms_p90": 23.69, + "tpot_ms_p99": 25.14, + "elapsed_seconds_median": 27.7, + "sla_met": false + }, + { + "target_qps": 50, + "achieved_qps": 50.0, + "ttft_ms_p50": 985.11, + "ttft_ms_p90": 8331.22, + "ttft_ms_p99": 8868.55, + "tpot_ms_p50": 23.38, + "tpot_ms_p90": 24.38, + "tpot_ms_p99": 26.79, + "elapsed_seconds_median": 25.6, + "sla_met": false + } + ] + }, + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 15, + "warmup_minutes": 1, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 366.9, + "tokens_out": 22031, + "tokens_in": 0, + "requests_completed": 63, + "ttft_ms_p50": 59.1, + "ttft_ms_p99": 396.4 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 402.7, + "tokens_out": 24148, + "tokens_in": 0, + "requests_completed": 71, + "ttft_ms_p50": 58.8, + "ttft_ms_p99": 63.4 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 400.8, + "tokens_out": 24050, + "tokens_in": 0, + "requests_completed": 66, + "ttft_ms_p50": 58.5, + "ttft_ms_p99": 61.0 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 402.1, + "tokens_out": 24127, + "tokens_in": 0, + "requests_completed": 66, 
+ "ttft_ms_p50": 58.0, + "ttft_ms_p99": 61.2 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 395.3, + "tokens_out": 23722, + "tokens_in": 0, + "requests_completed": 71, + "ttft_ms_p50": 58.7, + "ttft_ms_p99": 64.5 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 398.4, + "tokens_out": 23893, + "tokens_in": 0, + "requests_completed": 64, + "ttft_ms_p50": 58.7, + "ttft_ms_p99": 63.0 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 410.0, + "tokens_out": 24605, + "tokens_in": 0, + "requests_completed": 66, + "ttft_ms_p50": 58.3, + "ttft_ms_p99": 62.1 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 398.6, + "tokens_out": 23918, + "tokens_in": 0, + "requests_completed": 71, + "ttft_ms_p50": 59.5, + "ttft_ms_p99": 65.7 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 390.3, + "tokens_out": 23418, + "tokens_in": 0, + "requests_completed": 64, + "ttft_ms_p50": 58.3, + "ttft_ms_p99": 62.8 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 417.2, + "tokens_out": 25045, + "tokens_in": 0, + "requests_completed": 69, + "ttft_ms_p50": 58.2, + "ttft_ms_p99": 61.0 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 391.1, + "tokens_out": 23462, + "tokens_in": 0, + "requests_completed": 70, + "ttft_ms_p50": 57.8, + "ttft_ms_p99": 62.4 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 408.7, + "tokens_out": 24514, + "tokens_in": 0, + "requests_completed": 66, + "ttft_ms_p50": 58.5, + "ttft_ms_p99": 65.6 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 415.2, + "tokens_out": 24925, + "tokens_in": 0, + "requests_completed": 71, + "ttft_ms_p50": 58.2, + "ttft_ms_p99": 61.3 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 395.0, + "tokens_out": 23687, + "tokens_in": 0, + 
"requests_completed": 67, + "ttft_ms_p50": 58.0, + "ttft_ms_p99": 64.9 + } + ], + "sustained_throughput_tokens_per_sec": 399.4, + "throttle_ratio": 0.879, + "throttle_onset_minute": 1.0, + "ttft_p99_drift_ms": -331.5 + } + }, + "accuracy": { + "subset_score": 0.59, + "baseline_delta": 0.0, + "valid": true, + "framework": "vLLM", + "precision": "W8A8", + "notes": "Integrated accuracy check \u2014 used same vLLM instance as benchmark." + }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "13:39:44", + "run_id": "1b79437b", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_1b79437b", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-18T13:36:50.535504+00:00", + "benchmark_end_time": "2026-05-18T13:39:44.822889+00:00", + "benchmark_elapsed_minutes": 26.5, + "model_load_seconds": 18.2, + "benchmark_elapsed_minutes_note": "Total across ['offline', 'online', 'sustained'] scenarios.", + "scenario_dirs": { + "offline": "results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w8a8/offline", + "online": "results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w8a8/online", + "sustained": "results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w8a8/sustained" + } + } +} \ No newline at end of file diff --git a/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w8a8/sustained/result.json b/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w8a8/sustained/result.json new file mode 100644 index 00000000..eaea3a87 --- /dev/null +++ b/results/verified/nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_ffd81462/w8a8/sustained/result.json @@ -0,0 +1,279 @@ +{ + 
"schema_version": "1.0", + "suite_id": "suite_C", + "implementation_id": "nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-18T05:56:25.789998+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tPXB\tSYS\tSYS\t0-63,128-191\t0\t\tN/A\nNIC0\tPXB\t X \tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC2\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + 
"bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" + }, + "software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.8.1", + "driver_version": "580.65.06", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0" + }, + "model": { + "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "model_revision": "e2bfb7d92784ad7d1b606c2f9644d3cefb2ec708", + "model_name": null, + "model_note": "INT8 quantized by RedHatAI using llm-compressor (compressed-tensors). Both weights and activations quantized to INT8.", + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "W8A8", + "effective_dtype": null, + "quantization_method": "compressed-tensors", + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "sustained", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": { + "tensor_parallel_size": 1, + "enforce_eager": true, + "max_num_seqs": 512, + "gpu_memory_utilization": 0.9 + }, + "runtime_metrics": null + }, + "metrics": { + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 15, + "warmup_minutes": 1, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 366.9, + "tokens_out": 22031, + "tokens_in": 0, + "requests_completed": 63, + "ttft_ms_p50": 59.1, + "ttft_ms_p99": 396.4 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 402.7, + "tokens_out": 24148, + "tokens_in": 0, + "requests_completed": 71, + "ttft_ms_p50": 58.8, + "ttft_ms_p99": 63.4 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 400.8, + "tokens_out": 
24050, + "tokens_in": 0, + "requests_completed": 66, + "ttft_ms_p50": 58.5, + "ttft_ms_p99": 61.0 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 402.1, + "tokens_out": 24127, + "tokens_in": 0, + "requests_completed": 66, + "ttft_ms_p50": 58.0, + "ttft_ms_p99": 61.2 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 395.3, + "tokens_out": 23722, + "tokens_in": 0, + "requests_completed": 71, + "ttft_ms_p50": 58.7, + "ttft_ms_p99": 64.5 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 398.4, + "tokens_out": 23893, + "tokens_in": 0, + "requests_completed": 64, + "ttft_ms_p50": 58.7, + "ttft_ms_p99": 63.0 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 410.0, + "tokens_out": 24605, + "tokens_in": 0, + "requests_completed": 66, + "ttft_ms_p50": 58.3, + "ttft_ms_p99": 62.1 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 398.6, + "tokens_out": 23918, + "tokens_in": 0, + "requests_completed": 71, + "ttft_ms_p50": 59.5, + "ttft_ms_p99": 65.7 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 390.3, + "tokens_out": 23418, + "tokens_in": 0, + "requests_completed": 64, + "ttft_ms_p50": 58.3, + "ttft_ms_p99": 62.8 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 417.2, + "tokens_out": 25045, + "tokens_in": 0, + "requests_completed": 69, + "ttft_ms_p50": 58.2, + "ttft_ms_p99": 61.0 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 391.1, + "tokens_out": 23462, + "tokens_in": 0, + "requests_completed": 70, + "ttft_ms_p50": 57.8, + "ttft_ms_p99": 62.4 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 408.7, + "tokens_out": 24514, + "tokens_in": 0, + "requests_completed": 66, + "ttft_ms_p50": 58.5, + "ttft_ms_p99": 65.6 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 415.2, + 
"tokens_out": 24925, + "tokens_in": 0, + "requests_completed": 71, + "ttft_ms_p50": 58.2, + "ttft_ms_p99": 61.3 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 395.0, + "tokens_out": 23687, + "tokens_in": 0, + "requests_completed": 67, + "ttft_ms_p50": 58.0, + "ttft_ms_p99": 64.9 + } + ], + "sustained_throughput_tokens_per_sec": 399.4, + "throttle_ratio": 0.879, + "throttle_onset_minute": 1.0, + "ttft_p99_drift_ms": -331.5 + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "14:04:58", + "run_id": "1b79437b", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_C_nvidia_vllm020_0f6c56e4_1b79437b", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "env_info_file": "../../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-18T13:49:43.407380+00:00", + "benchmark_end_time": "2026-05-18T14:04:58.852879+00:00", + "benchmark_elapsed_minutes": 15.3, + "model_load_seconds": 22.2 + } +} \ No newline at end of file diff --git a/results/verified/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_vllm020_0f6c56e4_43e96189/accuracy/accuracy.json b/results/verified/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_vllm020_0f6c56e4_43e96189/accuracy/accuracy.json new file mode 100644 index 00000000..95fced50 --- /dev/null +++ b/results/verified/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_vllm020_0f6c56e4_43e96189/accuracy/accuracy.json @@ -0,0 +1,8 @@ +{ + "subset_score": 0.56, + "baseline_delta": 0.0, + "valid": true, + "framework": "vLLM", + "precision": "BF16", + "notes": "Integrated accuracy check \u2014 used same vLLM instance as benchmark." 
+} \ No newline at end of file diff --git a/results/verified/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_vllm020_0f6c56e4_43e96189/env_info.json b/results/verified/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_vllm020_0f6c56e4_43e96189/env_info.json new file mode 100644 index 00000000..327ddbe4 --- /dev/null +++ b/results/verified/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_vllm020_0f6c56e4_43e96189/env_info.json @@ -0,0 +1,49 @@ +{ + "collected_at": "2026-05-18T07:00:53.162228+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tPXB\tSYS\tSYS\t0-63,128-191\t0\t\tN/A\nNIC0\tPXB\t X \tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC2\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + 
"type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" +} \ No newline at end of file diff --git a/results/verified/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_vllm020_0f6c56e4_43e96189/interactive/result.json b/results/verified/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_vllm020_0f6c56e4_43e96189/interactive/result.json new file mode 100644 index 00000000..40a91c48 --- /dev/null +++ b/results/verified/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_vllm020_0f6c56e4_43e96189/interactive/result.json @@ -0,0 +1,132 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_D", + "implementation_id": "nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-18T07:00:53.162228+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tPXB\tSYS\tSYS\t0-63,128-191\t0\t\tN/A\nNIC0\tPXB\t X \tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC2\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing 
multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" + }, + "software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.8.1", + "driver_version": "580.65.06", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": null, + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "interactive", + "num_runs": 2, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "interactive": { + "ttft_ms_p50": 3178.55, + "ttft_ms_p90": 3335.27, + "ttft_ms_p99": 3376.37, + "tpot_ms_p50": 13.1, + "tpot_ms_p90": 
13.17, + "tpot_ms_p99": 13.2, + "peak_memory_gb": null, + "elapsed_seconds_median": 651.9 + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "07:46:17", + "run_id": "43e96189", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_D_nvidia_vllm020_0f6c56e4_43e96189", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-18T07:24:33.202740+00:00", + "benchmark_end_time": "2026-05-18T07:46:17.004385+00:00", + "benchmark_elapsed_minutes": 21.7, + "model_load_seconds": 65.2 + } +} diff --git a/results/verified/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_vllm020_0f6c56e4_43e96189/offline/result.json b/results/verified/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_vllm020_0f6c56e4_43e96189/offline/result.json new file mode 100644 index 00000000..0778b9c8 --- /dev/null +++ b/results/verified/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_vllm020_0f6c56e4_43e96189/offline/result.json @@ -0,0 +1,152 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_D", + "implementation_id": "nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-18T07:00:53.162228+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA 
Affinity\tGPU NUMA ID\nGPU0\t X \tPXB\tSYS\tSYS\t0-63,128-191\t0\t\tN/A\nNIC0\tPXB\t X \tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC2\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" + }, + "software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.8.1", + "driver_version": "580.65.06", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + 
"effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "offline", + "num_runs": 2, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 65.15, + "throughput_tokens_per_sec_per_chip": 65.15, + "throughput_tokens_per_sec_total": 7353.82, + "elapsed_seconds_median": 196.5, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 65.12, + "throughput_tokens_per_sec_per_chip": 65.12, + "throughput_tokens_per_sec_total": 7349.93, + "elapsed_seconds_median": 196.6, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." 
+ }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "07:23:02", + "run_id": "43e96189", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_D_nvidia_vllm020_0f6c56e4_43e96189", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-18T07:03:23.326807+00:00", + "benchmark_end_time": "2026-05-18T07:23:02.065079+00:00", + "benchmark_elapsed_minutes": 19.6, + "model_load_seconds": 41.7 + } +} diff --git a/results/verified/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_vllm020_0f6c56e4_43e96189/online/result.json b/results/verified/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_vllm020_0f6c56e4_43e96189/online/result.json new file mode 100644 index 00000000..fd6b20e6 --- /dev/null +++ b/results/verified/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_vllm020_0f6c56e4_43e96189/online/result.json @@ -0,0 +1,169 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_D", + "implementation_id": "nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-18T07:00:53.162228+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tPXB\tSYS\tSYS\t0-63,128-191\t0\t\tN/A\nNIC0\tPXB\t X \tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC2\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP 
interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" + }, + "software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.8.1", + "driver_version": "580.65.06", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": null, + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "online", + "num_runs": 2, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + 
"expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": { + "tensor_parallel_size": 1, + "enforce_eager": false, + "max_num_seqs": 64, + "gpu_memory_utilization": 0.85 + }, + "runtime_metrics": null + }, + "metrics": { + "online": { + "sla_ttft_ms": 5000, + "max_valid_qps": 0.0, + "results_by_qps": [ + { + "target_qps": 0.5, + "achieved_qps": 0.5, + "ttft_ms_p50": 129241.71, + "ttft_ms_p90": 238515.99, + "ttft_ms_p99": 255266.98, + "tpot_ms_p50": 231.96, + "tpot_ms_p90": 236.23, + "tpot_ms_p99": 238.5, + "elapsed_seconds_median": 459.2, + "sla_met": false + }, + { + "target_qps": 1, + "achieved_qps": 1.0, + "ttft_ms_p50": 163924.47, + "ttft_ms_p90": 304663.59, + "ttft_ms_p99": 340432.59, + "tpot_ms_p50": 232.21, + "tpot_ms_p90": 236.44, + "tpot_ms_p99": 238.73, + "elapsed_seconds_median": 461.6, + "sla_met": false + }, + { + "target_qps": 2, + "achieved_qps": 2.0, + "ttft_ms_p50": 197613.68, + "ttft_ms_p90": 361816.51, + "ttft_ms_p99": 400408.53, + "tpot_ms_p50": 232.17, + "tpot_ms_p90": 236.5, + "tpot_ms_p99": 238.76, + "elapsed_seconds_median": 459.2, + "sla_met": false + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." 
+ }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "10:05:01", + "run_id": "43e96189", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_D_nvidia_vllm020_0f6c56e4_43e96189", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-18T09:19:01.620038+00:00", + "benchmark_end_time": "2026-05-18T10:05:01.503808+00:00", + "benchmark_elapsed_minutes": 46.0, + "model_load_seconds": 51.9 + } +} \ No newline at end of file diff --git a/results/verified/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_vllm020_0f6c56e4_43e96189/result.json b/results/verified/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_vllm020_0f6c56e4_43e96189/result.json new file mode 100644 index 00000000..ac745b18 --- /dev/null +++ b/results/verified/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_vllm020_0f6c56e4_43e96189/result.json @@ -0,0 +1,519 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_D", + "implementation_id": "nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-18T07:00:53.162228+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tPXB\tSYS\tSYS\t0-63,128-191\t0\t\tN/A\nNIC0\tPXB\t X \tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC2\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the 
SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" + }, + "software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.8.1", + "driver_version": "580.65.06", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenarios_run": [ + "offline", + "interactive", + "sustained", + "online" + ], + "parallelism": { + "tensor_parallel_size": 1, + 
"pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 2, + "extra_config": null + }, + "metrics": { + "derived": {}, + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 1, + "throughput_tokens_per_sec": 65.15, + "throughput_tokens_per_sec_per_chip": 65.15, + "throughput_tokens_per_sec_total": 7353.82, + "elapsed_seconds_median": 196.5, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 65.12, + "throughput_tokens_per_sec_per_chip": 65.12, + "throughput_tokens_per_sec_total": 7349.93, + "elapsed_seconds_median": 196.6, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." 
+ } + ] + }, + "interactive": { + "ttft_ms_p50": 3178.55, + "ttft_ms_p90": 3335.27, + "ttft_ms_p99": 3376.37, + "tpot_ms_p50": 13.1, + "tpot_ms_p90": 13.17, + "tpot_ms_p99": 13.2, + "peak_memory_gb": null, + "elapsed_seconds_median": 651.9 + }, + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 30, + "warmup_minutes": 2, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": true, + "throughput_tokens_per_sec": 34.1, + "tokens_out": 2048, + "tokens_in": 0, + "requests_completed": 8, + "ttft_ms_p50": 15012.6, + "ttft_ms_p99": 27511.4 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 55.4, + "tokens_out": 3328, + "tokens_in": 0, + "requests_completed": 13, + "ttft_ms_p50": 6118.2, + "ttft_ms_p99": 6481.6 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 59.7, + "tokens_out": 3584, + "tokens_in": 0, + "requests_completed": 14, + "ttft_ms_p50": 5323.4, + "ttft_ms_p99": 6114.9 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 55.5, + "tokens_out": 3328, + "tokens_in": 0, + "requests_completed": 13, + "ttft_ms_p50": 5619.9, + "ttft_ms_p99": 6149.6 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 64.0, + "tokens_out": 3840, + "tokens_in": 0, + "requests_completed": 15, + "ttft_ms_p50": 5341.4, + "ttft_ms_p99": 6110.4 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 55.5, + "tokens_out": 3328, + "tokens_in": 0, + "requests_completed": 13, + "ttft_ms_p50": 5932.5, + "ttft_ms_p99": 6440.4 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 59.7, + "tokens_out": 3584, + "tokens_in": 0, + "requests_completed": 14, + "ttft_ms_p50": 4964.1, + "ttft_ms_p99": 5812.6 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 59.8, + "tokens_out": 3584, + "tokens_in": 0, + "requests_completed": 14, + "ttft_ms_p50": 5597.6, + "ttft_ms_p99": 6251.9 
+ }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 59.7, + "tokens_out": 3584, + "tokens_in": 0, + "requests_completed": 14, + "ttft_ms_p50": 5486.4, + "ttft_ms_p99": 6180.1 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 59.7, + "tokens_out": 3584, + "tokens_in": 0, + "requests_completed": 14, + "ttft_ms_p50": 5472.0, + "ttft_ms_p99": 6505.0 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 55.5, + "tokens_out": 3328, + "tokens_in": 0, + "requests_completed": 13, + "ttft_ms_p50": 5850.6, + "ttft_ms_p99": 6694.6 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 64.0, + "tokens_out": 3840, + "tokens_in": 0, + "requests_completed": 15, + "ttft_ms_p50": 5208.5, + "ttft_ms_p99": 5840.7 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 55.5, + "tokens_out": 3328, + "tokens_in": 0, + "requests_completed": 13, + "ttft_ms_p50": 5909.3, + "ttft_ms_p99": 6251.2 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 55.5, + "tokens_out": 3328, + "tokens_in": 0, + "requests_completed": 13, + "ttft_ms_p50": 5593.9, + "ttft_ms_p99": 6073.0 + }, + { + "minute": 15.0, + "is_warmup": false, + "throughput_tokens_per_sec": 59.7, + "tokens_out": 3584, + "tokens_in": 0, + "requests_completed": 14, + "ttft_ms_p50": 5297.1, + "ttft_ms_p99": 6684.1 + }, + { + "minute": 16.0, + "is_warmup": false, + "throughput_tokens_per_sec": 64.0, + "tokens_out": 3840, + "tokens_in": 0, + "requests_completed": 15, + "ttft_ms_p50": 5956.8, + "ttft_ms_p99": 6615.6 + }, + { + "minute": 17.0, + "is_warmup": false, + "throughput_tokens_per_sec": 55.4, + "tokens_out": 3328, + "tokens_in": 0, + "requests_completed": 13, + "ttft_ms_p50": 5954.7, + "ttft_ms_p99": 6462.3 + }, + { + "minute": 18.0, + "is_warmup": false, + "throughput_tokens_per_sec": 55.5, + "tokens_out": 3328, + "tokens_in": 0, + "requests_completed": 13, + "ttft_ms_p50": 
5267.7, + "ttft_ms_p99": 6152.2 + }, + { + "minute": 19.0, + "is_warmup": false, + "throughput_tokens_per_sec": 64.0, + "tokens_out": 3840, + "tokens_in": 0, + "requests_completed": 15, + "ttft_ms_p50": 5455.5, + "ttft_ms_p99": 5958.6 + }, + { + "minute": 20.0, + "is_warmup": false, + "throughput_tokens_per_sec": 55.5, + "tokens_out": 3328, + "tokens_in": 0, + "requests_completed": 13, + "ttft_ms_p50": 5614.7, + "ttft_ms_p99": 6275.4 + }, + { + "minute": 21.0, + "is_warmup": false, + "throughput_tokens_per_sec": 59.7, + "tokens_out": 3584, + "tokens_in": 0, + "requests_completed": 14, + "ttft_ms_p50": 5592.8, + "ttft_ms_p99": 6443.6 + }, + { + "minute": 22.0, + "is_warmup": false, + "throughput_tokens_per_sec": 55.5, + "tokens_out": 3328, + "tokens_in": 0, + "requests_completed": 13, + "ttft_ms_p50": 5407.0, + "ttft_ms_p99": 6248.9 + }, + { + "minute": 23.0, + "is_warmup": false, + "throughput_tokens_per_sec": 64.0, + "tokens_out": 3840, + "tokens_in": 0, + "requests_completed": 15, + "ttft_ms_p50": 5348.3, + "ttft_ms_p99": 5840.6 + }, + { + "minute": 24.0, + "is_warmup": false, + "throughput_tokens_per_sec": 55.4, + "tokens_out": 3328, + "tokens_in": 0, + "requests_completed": 13, + "ttft_ms_p50": 5893.0, + "ttft_ms_p99": 6513.5 + }, + { + "minute": 25.0, + "is_warmup": false, + "throughput_tokens_per_sec": 59.8, + "tokens_out": 3584, + "tokens_in": 0, + "requests_completed": 14, + "ttft_ms_p50": 4939.8, + "ttft_ms_p99": 5825.9 + }, + { + "minute": 26.0, + "is_warmup": false, + "throughput_tokens_per_sec": 59.7, + "tokens_out": 3584, + "tokens_in": 0, + "requests_completed": 14, + "ttft_ms_p50": 4900.3, + "ttft_ms_p99": 6665.7 + }, + { + "minute": 27.0, + "is_warmup": false, + "throughput_tokens_per_sec": 59.7, + "tokens_out": 3584, + "tokens_in": 0, + "requests_completed": 14, + "ttft_ms_p50": 5623.0, + "ttft_ms_p99": 6163.1 + }, + { + "minute": 28.0, + "is_warmup": false, + "throughput_tokens_per_sec": 59.7, + "tokens_out": 3584, + "tokens_in": 0, + 
"requests_completed": 14, + "ttft_ms_p50": 5881.5, + "ttft_ms_p99": 6217.3 + }, + { + "minute": 29.0, + "is_warmup": false, + "throughput_tokens_per_sec": 55.5, + "tokens_out": 3328, + "tokens_in": 0, + "requests_completed": 13, + "ttft_ms_p50": 6084.9, + "ttft_ms_p99": 6683.6 + } + ], + "sustained_throughput_tokens_per_sec": 58.7, + "throttle_ratio": 0.866, + "throttle_onset_minute": 2.0, + "ttft_p99_drift_ms": 202.0 + }, + "online": { + "sla_ttft_ms": 5000, + "max_valid_qps": 0.0, + "results_by_qps": [ + { + "target_qps": 0.5, + "achieved_qps": 0.5, + "ttft_ms_p50": 129241.71, + "ttft_ms_p90": 238515.99, + "ttft_ms_p99": 255266.98, + "tpot_ms_p50": 231.96, + "tpot_ms_p90": 236.23, + "tpot_ms_p99": 238.5, + "elapsed_seconds_median": 459.2, + "sla_met": false + }, + { + "target_qps": 1, + "achieved_qps": 1.0, + "ttft_ms_p50": 163924.47, + "ttft_ms_p90": 304663.59, + "ttft_ms_p99": 340432.59, + "tpot_ms_p50": 232.21, + "tpot_ms_p90": 236.44, + "tpot_ms_p99": 238.73, + "elapsed_seconds_median": 461.6, + "sla_met": false + }, + { + "target_qps": 2, + "achieved_qps": 2.0, + "ttft_ms_p50": 197613.68, + "ttft_ms_p90": 361816.51, + "ttft_ms_p99": 400408.53, + "tpot_ms_p50": 232.17, + "tpot_ms_p90": 236.5, + "tpot_ms_p99": 238.76, + "elapsed_seconds_median": 459.2, + "sla_met": false + } + ] + } + }, + "accuracy": { + "subset_score": 0.56, + "baseline_delta": 0.0, + "valid": true, + "framework": "vLLM", + "precision": "BF16", + "notes": "Integrated accuracy check \u2014 used same vLLM instance as benchmark." 
+ }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "07:23:02", + "run_id": "43e96189", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_D_nvidia_vllm020_0f6c56e4_43e96189", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-18T07:03:23.326807+00:00", + "benchmark_end_time": "2026-05-18T07:23:02.065079+00:00", + "benchmark_elapsed_minutes": 118.1, + "model_load_seconds": 41.7, + "benchmark_elapsed_minutes_note": "Total across ['offline', 'interactive', 'sustained', 'online'] scenarios.", + "scenario_dirs": { + "offline": "results/verified/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_vllm020_0f6c56e4_43e96189/offline", + "interactive": "results/verified/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_vllm020_0f6c56e4_43e96189/interactive", + "sustained": "results/verified/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_vllm020_0f6c56e4_43e96189/sustained", + "online": "results/verified/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_vllm020_0f6c56e4_43e96189/online" + } + } +} \ No newline at end of file diff --git a/results/verified/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_vllm020_0f6c56e4_43e96189/sustained/result.json b/results/verified/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_vllm020_0f6c56e4_43e96189/sustained/result.json new file mode 100644 index 00000000..097e0e91 --- /dev/null +++ b/results/verified/nvidia_a100_sxm4_80gbx1_suite_D_nvidia_vllm020_0f6c56e4_43e96189/sustained/result.json @@ -0,0 +1,424 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_D", + "implementation_id": "nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": 
"2026-05-18T07:00:53.162228+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tPXB\tSYS\tSYS\t0-63,128-191\t0\t\tN/A\nNIC0\tPXB\t X \tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC2\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" + }, + "software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.8.1", + "driver_version": 
"580.65.06", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0" + }, + "model": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": null, + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "sustained", + "num_runs": 2, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "sustained": { + "sustained_concurrency": 8, + "duration_minutes": 30, + "warmup_minutes": 2, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": true, + "throughput_tokens_per_sec": 34.1, + "tokens_out": 2048, + "tokens_in": 0, + "requests_completed": 8, + "ttft_ms_p50": 15012.6, + "ttft_ms_p99": 27511.4 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 55.4, + "tokens_out": 3328, + "tokens_in": 0, + "requests_completed": 13, + "ttft_ms_p50": 6118.2, + "ttft_ms_p99": 6481.6 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 59.7, + "tokens_out": 3584, + "tokens_in": 0, + "requests_completed": 14, + "ttft_ms_p50": 5323.4, + "ttft_ms_p99": 6114.9 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 55.5, + "tokens_out": 3328, + "tokens_in": 0, + "requests_completed": 13, + "ttft_ms_p50": 5619.9, + "ttft_ms_p99": 6149.6 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 64.0, + "tokens_out": 3840, + "tokens_in": 0, + "requests_completed": 15, + "ttft_ms_p50": 5341.4, + "ttft_ms_p99": 6110.4 + }, + { + "minute": 6.0, + "is_warmup": false, + 
"throughput_tokens_per_sec": 55.5, + "tokens_out": 3328, + "tokens_in": 0, + "requests_completed": 13, + "ttft_ms_p50": 5932.5, + "ttft_ms_p99": 6440.4 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 59.7, + "tokens_out": 3584, + "tokens_in": 0, + "requests_completed": 14, + "ttft_ms_p50": 4964.1, + "ttft_ms_p99": 5812.6 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 59.8, + "tokens_out": 3584, + "tokens_in": 0, + "requests_completed": 14, + "ttft_ms_p50": 5597.6, + "ttft_ms_p99": 6251.9 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 59.7, + "tokens_out": 3584, + "tokens_in": 0, + "requests_completed": 14, + "ttft_ms_p50": 5486.4, + "ttft_ms_p99": 6180.1 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 59.7, + "tokens_out": 3584, + "tokens_in": 0, + "requests_completed": 14, + "ttft_ms_p50": 5472.0, + "ttft_ms_p99": 6505.0 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 55.5, + "tokens_out": 3328, + "tokens_in": 0, + "requests_completed": 13, + "ttft_ms_p50": 5850.6, + "ttft_ms_p99": 6694.6 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 64.0, + "tokens_out": 3840, + "tokens_in": 0, + "requests_completed": 15, + "ttft_ms_p50": 5208.5, + "ttft_ms_p99": 5840.7 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 55.5, + "tokens_out": 3328, + "tokens_in": 0, + "requests_completed": 13, + "ttft_ms_p50": 5909.3, + "ttft_ms_p99": 6251.2 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 55.5, + "tokens_out": 3328, + "tokens_in": 0, + "requests_completed": 13, + "ttft_ms_p50": 5593.9, + "ttft_ms_p99": 6073.0 + }, + { + "minute": 15.0, + "is_warmup": false, + "throughput_tokens_per_sec": 59.7, + "tokens_out": 3584, + "tokens_in": 0, + "requests_completed": 14, + "ttft_ms_p50": 5297.1, + "ttft_ms_p99": 6684.1 + }, + { + "minute": 
16.0, + "is_warmup": false, + "throughput_tokens_per_sec": 64.0, + "tokens_out": 3840, + "tokens_in": 0, + "requests_completed": 15, + "ttft_ms_p50": 5956.8, + "ttft_ms_p99": 6615.6 + }, + { + "minute": 17.0, + "is_warmup": false, + "throughput_tokens_per_sec": 55.4, + "tokens_out": 3328, + "tokens_in": 0, + "requests_completed": 13, + "ttft_ms_p50": 5954.7, + "ttft_ms_p99": 6462.3 + }, + { + "minute": 18.0, + "is_warmup": false, + "throughput_tokens_per_sec": 55.5, + "tokens_out": 3328, + "tokens_in": 0, + "requests_completed": 13, + "ttft_ms_p50": 5267.7, + "ttft_ms_p99": 6152.2 + }, + { + "minute": 19.0, + "is_warmup": false, + "throughput_tokens_per_sec": 64.0, + "tokens_out": 3840, + "tokens_in": 0, + "requests_completed": 15, + "ttft_ms_p50": 5455.5, + "ttft_ms_p99": 5958.6 + }, + { + "minute": 20.0, + "is_warmup": false, + "throughput_tokens_per_sec": 55.5, + "tokens_out": 3328, + "tokens_in": 0, + "requests_completed": 13, + "ttft_ms_p50": 5614.7, + "ttft_ms_p99": 6275.4 + }, + { + "minute": 21.0, + "is_warmup": false, + "throughput_tokens_per_sec": 59.7, + "tokens_out": 3584, + "tokens_in": 0, + "requests_completed": 14, + "ttft_ms_p50": 5592.8, + "ttft_ms_p99": 6443.6 + }, + { + "minute": 22.0, + "is_warmup": false, + "throughput_tokens_per_sec": 55.5, + "tokens_out": 3328, + "tokens_in": 0, + "requests_completed": 13, + "ttft_ms_p50": 5407.0, + "ttft_ms_p99": 6248.9 + }, + { + "minute": 23.0, + "is_warmup": false, + "throughput_tokens_per_sec": 64.0, + "tokens_out": 3840, + "tokens_in": 0, + "requests_completed": 15, + "ttft_ms_p50": 5348.3, + "ttft_ms_p99": 5840.6 + }, + { + "minute": 24.0, + "is_warmup": false, + "throughput_tokens_per_sec": 55.4, + "tokens_out": 3328, + "tokens_in": 0, + "requests_completed": 13, + "ttft_ms_p50": 5893.0, + "ttft_ms_p99": 6513.5 + }, + { + "minute": 25.0, + "is_warmup": false, + "throughput_tokens_per_sec": 59.8, + "tokens_out": 3584, + "tokens_in": 0, + "requests_completed": 14, + "ttft_ms_p50": 4939.8, + 
"ttft_ms_p99": 5825.9 + }, + { + "minute": 26.0, + "is_warmup": false, + "throughput_tokens_per_sec": 59.7, + "tokens_out": 3584, + "tokens_in": 0, + "requests_completed": 14, + "ttft_ms_p50": 4900.3, + "ttft_ms_p99": 6665.7 + }, + { + "minute": 27.0, + "is_warmup": false, + "throughput_tokens_per_sec": 59.7, + "tokens_out": 3584, + "tokens_in": 0, + "requests_completed": 14, + "ttft_ms_p50": 5623.0, + "ttft_ms_p99": 6163.1 + }, + { + "minute": 28.0, + "is_warmup": false, + "throughput_tokens_per_sec": 59.7, + "tokens_out": 3584, + "tokens_in": 0, + "requests_completed": 14, + "ttft_ms_p50": 5881.5, + "ttft_ms_p99": 6217.3 + }, + { + "minute": 29.0, + "is_warmup": false, + "throughput_tokens_per_sec": 55.5, + "tokens_out": 3328, + "tokens_in": 0, + "requests_completed": 13, + "ttft_ms_p50": 6084.9, + "ttft_ms_p99": 6683.6 + } + ], + "sustained_throughput_tokens_per_sec": 58.7, + "throttle_ratio": 0.866, + "throttle_onset_minute": 2.0, + "ttft_p99_drift_ms": 202.0 + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." 
+ }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "08:18:12", + "run_id": "43e96189", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_D_nvidia_vllm020_0f6c56e4_43e96189", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-18T07:47:24.813895+00:00", + "benchmark_end_time": "2026-05-18T08:18:12.448165+00:00", + "benchmark_elapsed_minutes": 30.8, + "model_load_seconds": 40.8 + } +} diff --git a/results/verified/nvidia_a100_sxm4_80gbx1_suite_F_nvidia_vllm020_0f6c56e4_a4e6a6e4/accuracy/accuracy.json b/results/verified/nvidia_a100_sxm4_80gbx1_suite_F_nvidia_vllm020_0f6c56e4_a4e6a6e4/accuracy/accuracy.json new file mode 100644 index 00000000..21e4fec2 --- /dev/null +++ b/results/verified/nvidia_a100_sxm4_80gbx1_suite_F_nvidia_vllm020_0f6c56e4_a4e6a6e4/accuracy/accuracy.json @@ -0,0 +1,8 @@ +{ + "subset_score": 0.41, + "baseline_delta": 0.03, + "valid": true, + "framework": "vLLM", + "precision": "BF16", + "notes": "Integrated accuracy check \u2014 used same vLLM instance as benchmark." 
+} \ No newline at end of file diff --git a/results/verified/nvidia_a100_sxm4_80gbx1_suite_F_nvidia_vllm020_0f6c56e4_a4e6a6e4/env_info.json b/results/verified/nvidia_a100_sxm4_80gbx1_suite_F_nvidia_vllm020_0f6c56e4_a4e6a6e4/env_info.json new file mode 100644 index 00000000..538f8e4c --- /dev/null +++ b/results/verified/nvidia_a100_sxm4_80gbx1_suite_F_nvidia_vllm020_0f6c56e4_a4e6a6e4/env_info.json @@ -0,0 +1,49 @@ +{ + "collected_at": "2026-05-18T10:05:28.924925+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tPXB\tSYS\tSYS\t0-63,128-191\t0\t\tN/A\nNIC0\tPXB\t X \tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC2\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + 
"type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" +} \ No newline at end of file diff --git a/results/verified/nvidia_a100_sxm4_80gbx1_suite_F_nvidia_vllm020_0f6c56e4_a4e6a6e4/interactive/result.json b/results/verified/nvidia_a100_sxm4_80gbx1_suite_F_nvidia_vllm020_0f6c56e4_a4e6a6e4/interactive/result.json new file mode 100644 index 00000000..f6765408 --- /dev/null +++ b/results/verified/nvidia_a100_sxm4_80gbx1_suite_F_nvidia_vllm020_0f6c56e4_a4e6a6e4/interactive/result.json @@ -0,0 +1,137 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_F", + "implementation_id": "nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-18T10:05:28.924925+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tPXB\tSYS\tSYS\t0-63,128-191\t0\t\tN/A\nNIC0\tPXB\t X \tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC2\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing 
multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" + }, + "software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.8.1", + "driver_version": "580.65.06", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0" + }, + "model": { + "model_id": "Qwen/Qwen2.5-0.5B-Instruct", + "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 0.5, + "precision": "BF16", + "effective_dtype": null, + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "interactive", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": { + "tensor_parallel_size": 1, + "enforce_eager": false, + "max_num_seqs": 128, + "gpu_memory_utilization": 0.9 + }, + "runtime_metrics": null + }, + "metrics": { + "interactive": { + 
"ttft_ms_p50": 11.24, + "ttft_ms_p90": 13.36, + "ttft_ms_p99": 14.74, + "tpot_ms_p50": 1.83, + "tpot_ms_p90": 1.83, + "tpot_ms_p99": 1.87, + "peak_memory_gb": null, + "elapsed_seconds_median": 59.1 + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "10:14:20", + "run_id": "a4e6a6e4", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_F_nvidia_vllm020_0f6c56e4_a4e6a6e4", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-18T10:11:23.809781+00:00", + "benchmark_end_time": "2026-05-18T10:14:20.932444+00:00", + "benchmark_elapsed_minutes": 3.0, + "model_load_seconds": 27.1 + } +} \ No newline at end of file diff --git a/results/verified/nvidia_a100_sxm4_80gbx1_suite_F_nvidia_vllm020_0f6c56e4_a4e6a6e4/offline/result.json b/results/verified/nvidia_a100_sxm4_80gbx1_suite_F_nvidia_vllm020_0f6c56e4_a4e6a6e4/offline/result.json new file mode 100644 index 00000000..8c532a40 --- /dev/null +++ b/results/verified/nvidia_a100_sxm4_80gbx1_suite_F_nvidia_vllm020_0f6c56e4_a4e6a6e4/offline/result.json @@ -0,0 +1,170 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_F", + "implementation_id": "nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-18T10:05:28.924925+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + 
"supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tPXB\tSYS\tSYS\t0-63,128-191\t0\t\tN/A\nNIC0\tPXB\t X \tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC2\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" + }, + "software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.8.1", + "driver_version": "580.65.06", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0" + }, + "model": { + "model_id": "Qwen/Qwen2.5-0.5B-Instruct", + "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775", + "model_name": null, 
+ "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 0.5, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "offline", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": { + "tensor_parallel_size": 1, + "enforce_eager": false, + "max_num_seqs": 128, + "gpu_memory_utilization": 0.9 + }, + "runtime_metrics": null + }, + "metrics": { + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 22462.36, + "throughput_tokens_per_sec_per_chip": 22462.36, + "throughput_tokens_per_sec_total": 33497.61, + "elapsed_seconds_median": 1.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 22493.12, + "throughput_tokens_per_sec_per_chip": 22493.12, + "throughput_tokens_per_sec_total": 33569.32, + "elapsed_seconds_median": 1.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." 
+ }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 22884.92, + "throughput_tokens_per_sec_per_chip": 22884.92, + "throughput_tokens_per_sec_total": 34039.23, + "elapsed_seconds_median": 1.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "10:07:40", + "run_id": "a4e6a6e4", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_F_nvidia_vllm020_0f6c56e4_a4e6a6e4", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-18T10:07:16.851531+00:00", + "benchmark_end_time": "2026-05-18T10:07:40.157696+00:00", + "benchmark_elapsed_minutes": 0.4, + "model_load_seconds": 28.0 + } +} \ No newline at end of file diff --git a/results/verified/nvidia_a100_sxm4_80gbx1_suite_F_nvidia_vllm020_0f6c56e4_a4e6a6e4/online/result.json b/results/verified/nvidia_a100_sxm4_80gbx1_suite_F_nvidia_vllm020_0f6c56e4_a4e6a6e4/online/result.json new file mode 100644 index 00000000..7df48900 --- /dev/null +++ b/results/verified/nvidia_a100_sxm4_80gbx1_suite_F_nvidia_vllm020_0f6c56e4_a4e6a6e4/online/result.json @@ -0,0 +1,157 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_F", + "implementation_id": "nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + 
"interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-18T10:05:28.924925+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tPXB\tSYS\tSYS\t0-63,128-191\t0\t\tN/A\nNIC0\tPXB\t X \tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC2\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" + }, + 
"software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.8.1", + "driver_version": "580.65.06", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0" + }, + "model": { + "model_id": "Qwen/Qwen2.5-0.5B-Instruct", + "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 0.5, + "precision": "BF16", + "effective_dtype": null, + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "online", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": { + "tensor_parallel_size": 1, + "enforce_eager": false, + "max_num_seqs": 128, + "gpu_memory_utilization": 0.9 + }, + "runtime_metrics": null + }, + "metrics": { + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 40, + "results_by_qps": [ + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 10.04, + "ttft_ms_p90": 12.66, + "ttft_ms_p99": 18.27, + "tpot_ms_p50": 2.13, + "tpot_ms_p90": 2.19, + "tpot_ms_p99": 2.37, + "elapsed_seconds_median": 32.0, + "sla_met": true + }, + { + "target_qps": 40, + "achieved_qps": 40.0, + "ttft_ms_p50": 11.96, + "ttft_ms_p90": 15.96, + "ttft_ms_p99": 19.93, + "tpot_ms_p50": 2.5, + "tpot_ms_p90": 2.65, + "tpot_ms_p99": 2.87, + "elapsed_seconds_median": 7.9, + "sla_met": true + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." 
+ }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "10:10:30", + "run_id": "a4e6a6e4", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_F_nvidia_vllm020_0f6c56e4_a4e6a6e4", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-18T10:08:31.412469+00:00", + "benchmark_end_time": "2026-05-18T10:10:30.424981+00:00", + "benchmark_elapsed_minutes": 2.0, + "model_load_seconds": 24.1 + } +} \ No newline at end of file diff --git a/results/verified/nvidia_a100_sxm4_80gbx1_suite_F_nvidia_vllm020_0f6c56e4_a4e6a6e4/result.json b/results/verified/nvidia_a100_sxm4_80gbx1_suite_F_nvidia_vllm020_0f6c56e4_a4e6a6e4/result.json new file mode 100644 index 00000000..2e7e0ce3 --- /dev/null +++ b/results/verified/nvidia_a100_sxm4_80gbx1_suite_F_nvidia_vllm020_0f6c56e4_a4e6a6e4/result.json @@ -0,0 +1,375 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_F", + "implementation_id": "nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-18T10:05:28.924925+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tPXB\tSYS\tSYS\t0-63,128-191\t0\t\tN/A\nNIC0\tPXB\t X \tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC2\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the 
SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0", + "kernel_version": "5.15.0-60-generic", + "runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" + }, + "software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.8.1", + "driver_version": "580.65.06", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0" + }, + "model": { + "model_id": "Qwen/Qwen2.5-0.5B-Instruct", + "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 0.5, + "precision": "BF16", + "effective_dtype": "bfloat16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenarios_run": [ + "offline", + "online", + "interactive", + "sustained" + ], + "parallelism": { + "tensor_parallel_size": 1, + 
"pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 3, + "extra_config": { + "tensor_parallel_size": 1, + "enforce_eager": false, + "max_num_seqs": 128, + "gpu_memory_utilization": 0.9 + } + }, + "metrics": { + "derived": {}, + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 22462.36, + "throughput_tokens_per_sec_per_chip": 22462.36, + "throughput_tokens_per_sec_total": 33497.61, + "elapsed_seconds_median": 1.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 22493.12, + "throughput_tokens_per_sec_per_chip": 22493.12, + "throughput_tokens_per_sec_total": 33569.32, + "elapsed_seconds_median": 1.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 22884.92, + "throughput_tokens_per_sec_per_chip": 22884.92, + "throughput_tokens_per_sec_total": 34039.23, + "elapsed_seconds_median": 1.9, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." 
+ } + ] + }, + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 40, + "results_by_qps": [ + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 10.04, + "ttft_ms_p90": 12.66, + "ttft_ms_p99": 18.27, + "tpot_ms_p50": 2.13, + "tpot_ms_p90": 2.19, + "tpot_ms_p99": 2.37, + "elapsed_seconds_median": 32.0, + "sla_met": true + }, + { + "target_qps": 40, + "achieved_qps": 40.0, + "ttft_ms_p50": 11.96, + "ttft_ms_p90": 15.96, + "ttft_ms_p99": 19.93, + "tpot_ms_p50": 2.5, + "tpot_ms_p90": 2.65, + "tpot_ms_p99": 2.87, + "elapsed_seconds_median": 7.9, + "sla_met": true + } + ] + }, + "interactive": { + "ttft_ms_p50": 11.24, + "ttft_ms_p90": 13.36, + "ttft_ms_p99": 14.74, + "tpot_ms_p50": 1.83, + "tpot_ms_p90": 1.83, + "tpot_ms_p99": 1.87, + "peak_memory_gb": null, + "elapsed_seconds_median": 59.1 + }, + "sustained": { + "sustained_concurrency": 32, + "duration_minutes": 15, + "warmup_minutes": 1, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 11541.5, + "tokens_out": 692740, + "tokens_in": 0, + "requests_completed": 3291, + "ttft_ms_p50": 12.9, + "ttft_ms_p99": 36.1 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 11672.6, + "tokens_out": 700107, + "tokens_in": 0, + "requests_completed": 3324, + "ttft_ms_p50": 12.8, + "ttft_ms_p99": 20.3 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 11721.9, + "tokens_out": 703664, + "tokens_in": 0, + "requests_completed": 3337, + "ttft_ms_p50": 12.7, + "ttft_ms_p99": 19.1 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 11526.8, + "tokens_out": 691780, + "tokens_in": 0, + "requests_completed": 3289, + "ttft_ms_p50": 13.3, + "ttft_ms_p99": 20.6 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 11228.1, + "tokens_out": 673578, + "tokens_in": 0, + "requests_completed": 3190, + "ttft_ms_p50": 13.8, + "ttft_ms_p99": 21.5 + }, + { + 
"minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 11380.9, + "tokens_out": 682977, + "tokens_in": 0, + "requests_completed": 3245, + "ttft_ms_p50": 13.8, + "ttft_ms_p99": 21.0 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 11711.7, + "tokens_out": 702201, + "tokens_in": 0, + "requests_completed": 3331, + "ttft_ms_p50": 12.8, + "ttft_ms_p99": 20.1 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 11643.5, + "tokens_out": 698683, + "tokens_in": 0, + "requests_completed": 3317, + "ttft_ms_p50": 12.7, + "ttft_ms_p99": 20.3 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 11662.7, + "tokens_out": 700038, + "tokens_in": 0, + "requests_completed": 3323, + "ttft_ms_p50": 12.7, + "ttft_ms_p99": 20.2 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 11612.6, + "tokens_out": 696555, + "tokens_in": 0, + "requests_completed": 3294, + "ttft_ms_p50": 12.8, + "ttft_ms_p99": 19.0 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 11623.8, + "tokens_out": 697495, + "tokens_in": 0, + "requests_completed": 3317, + "ttft_ms_p50": 12.8, + "ttft_ms_p99": 19.7 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 11639.3, + "tokens_out": 698222, + "tokens_in": 0, + "requests_completed": 3311, + "ttft_ms_p50": 12.8, + "ttft_ms_p99": 20.2 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 11651.4, + "tokens_out": 699229, + "tokens_in": 0, + "requests_completed": 3321, + "ttft_ms_p50": 12.7, + "ttft_ms_p99": 20.7 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 11450.2, + "tokens_out": 686752, + "tokens_in": 0, + "requests_completed": 3257, + "ttft_ms_p50": 13.6, + "ttft_ms_p99": 20.7 + } + ], + "sustained_throughput_tokens_per_sec": 11576.2, + "throttle_ratio": 0.958, + "throttle_onset_minute": null, + "ttft_p99_drift_ms": -15.4 + } + }, + 
"accuracy": { + "subset_score": 0.41, + "baseline_delta": 0.03, + "valid": true, + "framework": "vLLM", + "precision": "BF16", + "notes": "Integrated accuracy check \u2014 used same vLLM instance as benchmark." + }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "10:07:40", + "run_id": "a4e6a6e4", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_F_nvidia_vllm020_0f6c56e4_a4e6a6e4", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-18T10:07:16.851531+00:00", + "benchmark_end_time": "2026-05-18T10:07:40.157696+00:00", + "benchmark_elapsed_minutes": 20.4, + "model_load_seconds": 28.0, + "benchmark_elapsed_minutes_note": "Total across ['offline', 'online', 'interactive', 'sustained'] scenarios.", + "scenario_dirs": { + "offline": "results/verified/nvidia_a100_sxm4_80gbx1_suite_F_nvidia_vllm020_0f6c56e4_a4e6a6e4/offline", + "online": "results/verified/nvidia_a100_sxm4_80gbx1_suite_F_nvidia_vllm020_0f6c56e4_a4e6a6e4/online", + "interactive": "results/verified/nvidia_a100_sxm4_80gbx1_suite_F_nvidia_vllm020_0f6c56e4_a4e6a6e4/interactive", + "sustained": "results/verified/nvidia_a100_sxm4_80gbx1_suite_F_nvidia_vllm020_0f6c56e4_a4e6a6e4/sustained" + } + } +} \ No newline at end of file diff --git a/results/verified/nvidia_a100_sxm4_80gbx1_suite_F_nvidia_vllm020_0f6c56e4_a4e6a6e4/sustained/result.json b/results/verified/nvidia_a100_sxm4_80gbx1_suite_F_nvidia_vllm020_0f6c56e4_a4e6a6e4/sustained/result.json new file mode 100644 index 00000000..6851ff63 --- /dev/null +++ b/results/verified/nvidia_a100_sxm4_80gbx1_suite_F_nvidia_vllm020_0f6c56e4_a4e6a6e4/sustained/result.json @@ -0,0 +1,279 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_F", + "implementation_id": "nvidia_vllm020_0f6c56e4", + "chip": { + "name": "NVIDIA 
A100-SXM4-80GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 80.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-18T10:05:28.924925+00:00", + "accelerators": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "vendor": "NVIDIA", + "memory_gb": 80.0, + "driver_version": "580.65.06", + "firmware_version": null, + "compute_capability": "8.0", + "supports_bf16": true + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tNIC0\tNIC1\tNIC2\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \tPXB\tSYS\tSYS\t0-63,128-191\t0\t\tN/A\nNIC0\tPXB\t X \tSYS\tSYS\t\t\t\t\nNIC1\tSYS\tSYS\t X \tPIX\t\t\t\t\nNIC2\tSYS\tSYS\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n\n", + "intra_node_interconnect": null, + "cpu": { + "model": "AMD EPYC 7742 64-Core Processor", + "physical_cores": 128, + "logical_cores": 255, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.7, + "pcie_generation": "PCIe Gen 4", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_2", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0", + "kernel_version": "5.15.0-60-generic", + 
"runtime_version": "CUDA 13.0", + "pytorch_version": "2.11.0+cu130" + }, + "software": { + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.8.1", + "driver_version": "580.65.06", + "runtime_version": "CUDA 13.0", + "os": "Ubuntu 22.04.4 LTS", + "python_version": "3.12.0" + }, + "model": { + "model_id": "Qwen/Qwen2.5-0.5B-Instruct", + "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 0.5, + "precision": "BF16", + "effective_dtype": null, + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "sustained", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": { + "tensor_parallel_size": 1, + "enforce_eager": false, + "max_num_seqs": 128, + "gpu_memory_utilization": 0.9 + }, + "runtime_metrics": null + }, + "metrics": { + "sustained": { + "sustained_concurrency": 32, + "duration_minutes": 15, + "warmup_minutes": 1, + "sample_interval_seconds": 60, + "samples": [ + { + "minute": 1.0, + "is_warmup": false, + "throughput_tokens_per_sec": 11541.5, + "tokens_out": 692740, + "tokens_in": 0, + "requests_completed": 3291, + "ttft_ms_p50": 12.9, + "ttft_ms_p99": 36.1 + }, + { + "minute": 2.0, + "is_warmup": false, + "throughput_tokens_per_sec": 11672.6, + "tokens_out": 700107, + "tokens_in": 0, + "requests_completed": 3324, + "ttft_ms_p50": 12.8, + "ttft_ms_p99": 20.3 + }, + { + "minute": 3.0, + "is_warmup": false, + "throughput_tokens_per_sec": 11721.9, + "tokens_out": 703664, + "tokens_in": 0, + "requests_completed": 3337, + "ttft_ms_p50": 12.7, + "ttft_ms_p99": 19.1 + }, + { + "minute": 4.0, + "is_warmup": false, + "throughput_tokens_per_sec": 11526.8, + "tokens_out": 691780, + "tokens_in": 0, + "requests_completed": 3289, + "ttft_ms_p50": 
13.3, + "ttft_ms_p99": 20.6 + }, + { + "minute": 5.0, + "is_warmup": false, + "throughput_tokens_per_sec": 11228.1, + "tokens_out": 673578, + "tokens_in": 0, + "requests_completed": 3190, + "ttft_ms_p50": 13.8, + "ttft_ms_p99": 21.5 + }, + { + "minute": 6.0, + "is_warmup": false, + "throughput_tokens_per_sec": 11380.9, + "tokens_out": 682977, + "tokens_in": 0, + "requests_completed": 3245, + "ttft_ms_p50": 13.8, + "ttft_ms_p99": 21.0 + }, + { + "minute": 7.0, + "is_warmup": false, + "throughput_tokens_per_sec": 11711.7, + "tokens_out": 702201, + "tokens_in": 0, + "requests_completed": 3331, + "ttft_ms_p50": 12.8, + "ttft_ms_p99": 20.1 + }, + { + "minute": 8.0, + "is_warmup": false, + "throughput_tokens_per_sec": 11643.5, + "tokens_out": 698683, + "tokens_in": 0, + "requests_completed": 3317, + "ttft_ms_p50": 12.7, + "ttft_ms_p99": 20.3 + }, + { + "minute": 9.0, + "is_warmup": false, + "throughput_tokens_per_sec": 11662.7, + "tokens_out": 700038, + "tokens_in": 0, + "requests_completed": 3323, + "ttft_ms_p50": 12.7, + "ttft_ms_p99": 20.2 + }, + { + "minute": 10.0, + "is_warmup": false, + "throughput_tokens_per_sec": 11612.6, + "tokens_out": 696555, + "tokens_in": 0, + "requests_completed": 3294, + "ttft_ms_p50": 12.8, + "ttft_ms_p99": 19.0 + }, + { + "minute": 11.0, + "is_warmup": false, + "throughput_tokens_per_sec": 11623.8, + "tokens_out": 697495, + "tokens_in": 0, + "requests_completed": 3317, + "ttft_ms_p50": 12.8, + "ttft_ms_p99": 19.7 + }, + { + "minute": 12.0, + "is_warmup": false, + "throughput_tokens_per_sec": 11639.3, + "tokens_out": 698222, + "tokens_in": 0, + "requests_completed": 3311, + "ttft_ms_p50": 12.8, + "ttft_ms_p99": 20.2 + }, + { + "minute": 13.0, + "is_warmup": false, + "throughput_tokens_per_sec": 11651.4, + "tokens_out": 699229, + "tokens_in": 0, + "requests_completed": 3321, + "ttft_ms_p50": 12.7, + "ttft_ms_p99": 20.7 + }, + { + "minute": 14.0, + "is_warmup": false, + "throughput_tokens_per_sec": 11450.2, + "tokens_out": 686752, + 
"tokens_in": 0, + "requests_completed": 3257, + "ttft_ms_p50": 13.6, + "ttft_ms_p99": 20.7 + } + ], + "sustained_throughput_tokens_per_sec": 11576.2, + "throttle_ratio": 0.958, + "throttle_onset_minute": null, + "ttft_p99_drift_ms": -15.4 + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "10:30:10", + "run_id": "a4e6a6e4", + "run_name": "nvidia_a100_sxm4_80gbx1_suite_F_nvidia_vllm020_0f6c56e4_a4e6a6e4", + "flagged": null, + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-18T10:15:08.957236+00:00", + "benchmark_end_time": "2026-05-18T10:30:10.126252+00:00", + "benchmark_elapsed_minutes": 15.0, + "model_load_seconds": 21.3 + } +} \ No newline at end of file diff --git a/run.py b/run.py index 85849c39..d7b9dbad 100644 --- a/run.py +++ b/run.py @@ -128,8 +128,11 @@ def cmd_list(args) -> int: print(f" {meta.get('description', '')}") if supersedes_chain: print(f" Replaces: {supersedes_chain[0]}") + install_sh = RUNNERS_DIR / rid / "install.sh" req_path = RUNNERS_DIR / rid / "requirements.txt" - if req_path.exists(): + if install_sh.exists(): + print(f" Install: bash runners/{rid}/install.sh") + elif req_path.exists(): print(f" Install: pip install -r runners/{rid}/requirements.txt") print() diff --git a/runners/benchmark_runner.py b/runners/benchmark_runner.py index 30afb70c..5b0c2747 100644 --- a/runners/benchmark_runner.py +++ b/runners/benchmark_runner.py @@ -561,8 +561,19 @@ def _compute_implementation_id(self) -> str | None: unexpected path or from the base class directly). 
""" try: - # Get the path of the concrete subclass file (not benchmark_runner.py) - runner_file = Path(inspect.getfile(self.__class__)) + # Resolve runner.py path. Prefer the defining module's __file__ because + # torch may patch inspect.getfile() and break dynamic imports. + runner_file = None + mod = sys.modules.get(self.__class__.__module__) + if mod is not None: + mod_file = getattr(mod, "__file__", None) + if mod_file: + runner_file = Path(mod_file).resolve() + if runner_file is None or runner_file.name != "runner.py": + try: + runner_file = Path(inspect.getfile(self.__class__)).resolve() + except (TypeError, OSError): + return None # The runner must be inside a folder named {platform}_{name}_{hash8} folder = runner_file.parent diff --git a/runners/nvidia_vllm020_0f6c56e4/README.md b/runners/nvidia_vllm020_0f6c56e4/README.md new file mode 100644 index 00000000..2d1657f7 --- /dev/null +++ b/runners/nvidia_vllm020_0f6c56e4/README.md @@ -0,0 +1,167 @@ +# nvidia_vllm020_0f6c56e4 — NVIDIA vLLM Runner (0.20.x) + +AccelMark reference runner for NVIDIA GPUs running **vLLM 0.20.x**. + +Supersedes [`nvidia_vllm_47f5d58e`](../nvidia_vllm_47f5d58e/) (vLLM 0.7.3). Use the predecessor for CUDA 11.8 / legacy stacks; use this runner for Ampere+ datacenter GPUs with CUDA 12.8 or 13.0. 
+ +## Supported suites + +| Suite | Description | Notes | +|-------|-------------|-------| +| Suite A | Single-chip, Llama-3-8B | Speculative and burst extra scenarios | +| Suite B | Multi-chip, Llama-3-70B | Requires 4× A100/H100 or equivalent | +| Suite C | Quantization, Llama-3.1-8B | **Requires `enforce_eager: true` in runner config** — see below | +| Suite D | Long context ~28K input | `max_model_len` 30,208 | +| Suite E | Multi-chip scaling, Llama-3-8B | NVLink recommended | +| Suite F | Consumer/edge, Qwen2.5-0.5B | Pre-Ampere: use predecessor + `--enforce-eager` | +| Suite G | MoE multi-chip, Mixtral-8x7B | ≥2× A100-80GB | + +## What changed vs nvidia_vllm_47f5d58e + +| Area | 0.7.3 (predecessor) | 0.20.x (this runner) | +|---|---|---| +| Default CUDA | 12.1 | **13.0** (12.8 via `PYTORCH_INDEX`) | +| PyTorch | 2.5.1 | **2.11** (pulled by vLLM) | +| Python | 3.10+ | **3.10–3.12** | +| Transformers | v4.57 | vLLM-pinned (see `result.json` version string) | +| FlashAttention | FA2 | FA4 (MLA prefill default on supported models) | +| Quantization | fp8, compressed-tensors, gptq_marlin | + **turboquant** | +| Model runner | V1 | V2 | + +Release notes: [v0.20.0](https://github.com/vllm-project/vllm/releases/tag/v0.20.0) · [v0.20.1](https://github.com/vllm-project/vllm/releases/tag/v0.20.1). 
+ +## Installation + +### Prerequisites + +- NVIDIA GPU, compute capability ≥ 7.0 (Volta+; Ampere+ recommended) +- **CUDA 13.0** driver/runtime (default for this stack), or **CUDA 12.8** via PyTorch index below +- **Python 3.10, 3.11, or 3.12** (not 3.13+ until vLLM supports it) +- A clean virtualenv/conda env if upgrading from `vllm==0.7.3` (mixed installs break imports) + +### Recommended: `install.sh` + +From the AccelMark repo root: + +```bash +# Create and activate a fresh env (example) +conda create -n accel python=3.12 -y +conda activate accel + +# Default install (CUDA 13.0 wheels from vLLM) +bash runners/nvidia_vllm020_0f6c56e4/install.sh +``` + +CUDA **12.8** hosts must point pip at the cu128 PyTorch index: + +```bash +PYTORCH_INDEX=https://download.pytorch.org/whl/cu128 \ + bash runners/nvidia_vllm020_0f6c56e4/install.sh +``` + +`install.sh` reads versions from `requirements.txt` and installs in three stages (pip cannot resolve `vllm` and `mistral-common[image]` in one pass). **Do not** run `pip install -r requirements.txt` directly. 
+ +### Verify + +```bash +python -c "import vllm, torch; print('vllm', vllm.__version__, 'torch', torch.__version__, 'cuda', torch.cuda.is_available())" +``` + +### Manual install (equivalent to `install.sh`) + +```bash +pip install mistral-common==1.11.2 +pip install vllm==0.20.1 # add --extra-index-url if using PYTORCH_INDEX above +pip install "numpy>=1.26.0,<2.0" jsonschema psutil tqdm nvidia-ml-py PyYAML +``` + +### Submitter profile and local models + +```bash +cp configs/submitter.yaml.example configs/submitter.yaml # set submitted_by +cp configs/models_local.yaml.example configs/models_local.yaml # optional local paths +``` + +## Usage + +```bash +python run.py --runner nvidia_vllm020_0f6c56e4 --suite suite_A +python run.py --runner nvidia_vllm020_0f6c56e4 --suite suite_B --tensor-parallel-size 4 +python run.py --runner nvidia_vllm020_0f6c56e4 --suite suite_C +``` + +Or invoke the runner directly: + +```bash +python runners/nvidia_vllm020_0f6c56e4/runner.py --suite suite_F --scenario offline +``` + +## Runner config + +```bash +cp configs/runner_configs/runner_nvidia_vllm020_0f6c56e4.yaml.example \ + configs/runner_configs/runner_nvidia_vllm020_0f6c56e4.yaml +``` + +Merge priority: CLI flags > suite-specific section > global defaults. + +### Suite C — quantization + +Copy `runner_nvidia_vllm020_0f6c56e4.yaml` from the example and keep the `suite_C` override: + +```yaml +suites: + suite_C: + enforce_eager: true +``` + +**`enforce_eager` (required for W8A8 / W8A16 on all GPUs):** vLLM 0.20 + CUDA graphs + `compressed-tensors` can yield repetitive garbage (`-addon-addon-…`) with normal-looking offline throughput. Suite C must set `enforce_eager: true` (or pass `--enforce-eager`). + +**FP8 on Ampere (A100 / A800 / RTX 30xx, compute capability < 8.9):** vLLM 0.20 does **not** run RedHatAI FP8 checkpoints correctly. The engine falls back to weight-only Marlin FP8 (`marlin_utils_fp8` warning in the log) and accuracy stays ~0 even with `enforce_eager: true`. 
This is a vLLM 0.20 limitation, not an AccelMark bug. On these GPUs, Suite C **W8A8 / W8A16 / BF16** are valid; for FP8 use **H100+** (sm ≥ 8.9) or the [`nvidia_vllm_47f5d58e`](../nvidia_vllm_47f5d58e/) runner on vLLM 0.7.3. + +### Optional `engine_kwargs` (0.20) + +```yaml +engine_kwargs: + attention_backend: FLASH_ATTN_4 + # compilation_config: + # cudagraph_mode: full_and_piecewise + # kv_cache_dtype: turboquant # experimental; suite C +``` + +See [vLLM EngineArgs](https://docs.vllm.ai/en/latest/api/vllm/engine/arg_utils.html). + +## Troubleshooting + +### Large-memory GPUs (H20, A100 80GB) — SIGFPE / silent crash + +Symptom: subprocess exits with `SIGFPE (return code -8)` after model load or on first batch. + +```bash +pip install --upgrade nvidia-cublas-cu13 +``` + +On CUDA 12.8 stacks use `nvidia-cublas-cu12` instead. Details: [predecessor README](../nvidia_vllm_47f5d58e/README.md#large-memory-gpus-h20-a100-80-gb-etc). + +### Pre-Ampere (V100, T4, RTX 20xx) + +This runner targets Ampere+ with CUDA 12.8/13.0. For Volta/Turing, use [`nvidia_vllm_47f5d58e`](../nvidia_vllm_47f5d58e/) with `--enforce-eager` (BF16→FP16 fallback, no CUDA graphs). See the predecessor README for Suite F / Suite A on V100. + +### Suite C accuracy ~0 but offline OK + +1. Confirm `configs/runner_configs/runner_nvidia_vllm020_0f6c56e4.yaml` exists and has `suites.suite_C.enforce_eager: true`. +2. Re-run accuracy with `--force` (or delete the format’s `accuracy/` folder). +3. If the log shows `Weight-only FP8 compression will be used leveraging the Marlin kernel` on an **A100**, FP8 will stay ~0 on vLLM 0.20 — use W8A8/W8A16 or H100+ for FP8 (see Suite C section above). + +## Hardware matrix + +Full GPU compatibility table: [`nvidia_vllm_47f5d58e/README.md`](../nvidia_vllm_47f5d58e/README.md#hardware-compatibility). 
+ +## Files + +| File | Purpose | +|------|---------| +| `runner.py` | Runner implementation | +| `meta.json` | Runner metadata and suite support | +| `requirements.txt` | Pinned dependency list (source of truth) | +| `install.sh` | Staged pip install | diff --git a/runners/nvidia_vllm020_0f6c56e4/install.sh b/runners/nvidia_vllm020_0f6c56e4/install.sh new file mode 100644 index 00000000..e4e82924 --- /dev/null +++ b/runners/nvidia_vllm020_0f6c56e4/install.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash +# Install dependencies from requirements.txt in three stages. +# pip cannot resolve vllm and mistral-common[image] in a single install pass. +set -euo pipefail + +RUNNER_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REQ="${RUNNER_DIR}/requirements.txt" +EXTRA=() +if [[ -n "${PYTORCH_INDEX:-}" ]]; then + EXTRA=(--extra-index-url "${PYTORCH_INDEX}") +fi + +line() { awk -v p="$1" '$0 ~ "^" p "[=<>]" { print; exit }' "${REQ}"; } + +echo "==> $(line mistral-common)" +pip install "$(line mistral-common)" + +echo "==> $(line vllm)" +pip install "$(line vllm)" "${EXTRA[@]}" + +TMP="$(mktemp)" +trap 'rm -f "${TMP}"' EXIT +awk '!/^#/ && NF && $0 !~ /^mistral-common/ && $0 !~ /^vllm/' "${REQ}" > "${TMP}" +echo "==> AccelMark utilities" +pip install -r "${TMP}" + +python -c "import vllm; print('OK — vllm', vllm.__version__)" diff --git a/runners/nvidia_vllm020_0f6c56e4/meta.json b/runners/nvidia_vllm020_0f6c56e4/meta.json new file mode 100644 index 00000000..fcf9f0c9 --- /dev/null +++ b/runners/nvidia_vllm020_0f6c56e4/meta.json @@ -0,0 +1,21 @@ +{ + "id": "nvidia_vllm020_0f6c56e4", + "platform": "nvidia", + "name": "vLLM 0.20 on NVIDIA", + "framework": "vLLM", + "submitted_by": "JuhaoLiang1997", + "description": "AccelMark reference runner for NVIDIA GPUs using vLLM 0.20.x. Supersedes nvidia_vllm_47f5d58e (vLLM 0.7.3). Supports suites A–G.", + "supersedes_chain": [], + "notes": "vLLM 0.20.x line: torch 2.11, CUDA 13.0 default. Adds turboquant backend. 
Suite C requires enforce_eager in runner config (see README).", + "created": "2026-05-15", + "hardware_label": null, + "suite_support": { + "A": "pending", + "B": "pending", + "C": "pending", + "D": "pending", + "E": "pending", + "F": "pending", + "G": "pending" + } +} diff --git a/runners/nvidia_vllm020_0f6c56e4/requirements.txt b/runners/nvidia_vllm020_0f6c56e4/requirements.txt new file mode 100644 index 00000000..b09fcee1 --- /dev/null +++ b/runners/nvidia_vllm020_0f6c56e4/requirements.txt @@ -0,0 +1,19 @@ +# AccelMark — NVIDIA vLLM 0.20.x dependencies +# +# Install: bash install.sh +# Do not: pip install -r requirements.txt (pip mistral-common[image] resolver bug) +# +# Python 3.10–3.12. Reference stack: torch 2.11 + vllm 0.20.1 + CUDA 13.0 +# CUDA 12.8: PYTORCH_INDEX=https://download.pytorch.org/whl/cu128 bash install.sh + +# --- vLLM stack (install.sh stages these; torch/transformers pulled by vllm) --- +mistral-common==1.11.2 +vllm==0.20.1 + +# --- AccelMark utilities --- +numpy>=1.26.0,<2.0 +jsonschema>=4.20.0 +psutil>=7.0.0 +tqdm>=4.66.0 +nvidia-ml-py>=13.0 +PyYAML>=6.0 diff --git a/runners/nvidia_vllm020_0f6c56e4/runner.py b/runners/nvidia_vllm020_0f6c56e4/runner.py new file mode 100644 index 00000000..8383dfaf --- /dev/null +++ b/runners/nvidia_vllm020_0f6c56e4/runner.py @@ -0,0 +1,378 @@ +""" +AccelMark — NVIDIA vLLM benchmark script (vLLM 0.20.x). + +Implements BenchmarkRunner for vLLM 0.20.x on NVIDIA GPUs. +All orchestration logic lives in runners/benchmark_runner.py. 
+""" + +import asyncio +import sys +import time +from pathlib import Path +from typing import Optional + +_REPO_ROOT = Path(__file__).resolve().parent.parent.parent +sys.path.insert(0, str(_REPO_ROOT)) + +import torch +from vllm import LLM, AsyncLLMEngine, SamplingParams +from vllm.engine.arg_utils import AsyncEngineArgs +from transformers import AutoTokenizer + +from runners.benchmark_runner import BenchmarkRunner, InferenceRequest +from loadgen.types import InferenceResult + + +import logging +logging.getLogger("vllm.engine.async_llm_engine").setLevel(logging.WARNING) +logging.getLogger("vllm.engine.llm_engine").setLevel(logging.WARNING) + + +class VLLMRunner(BenchmarkRunner): + """AccelMark benchmark runner using vLLM on NVIDIA GPUs.""" + + SUPPORTS_STREAMING = True + SUPPORTS_BATCHING = True + SUPPORTS_ONLINE = True + SUPPORTS_MULTI_CHIP = True + + SUPPORTED_PRECISIONS = ["bf16", "fp16", "fp32"] + SUPPORTED_QUANTIZATION_BACKENDS = [ + "fp8", + "compressed-tensors", + "gptq_marlin", + "turboquant", + ] + + def __init__(self): + self.llm: LLM = None + self.engine: AsyncLLMEngine = None + self.tokenizer: AutoTokenizer = None + self.sampling_params: SamplingParams = None + self._loop: asyncio.AbstractEventLoop = None + + def _get_chip_count(self) -> int: + """Return the number of available CUDA GPUs.""" + try: + import torch + n = torch.cuda.device_count() + return n if n > 0 else 1 + except Exception: + return 1 + + def _get_framework_name(self) -> str: + return "vLLM" + + def _get_framework_version(self) -> str: + vllm_v = "unknown" + try: + import vllm + vllm_v = vllm.__version__ + except Exception: + pass + + tfm_v = None + try: + import transformers + tfm_v = transformers.__version__ + except Exception: + pass + + if tfm_v: + return f"{vllm_v}+transformers-{tfm_v}" + return vllm_v + + def load_model(self, model_path: str, parallelism: dict) -> None: + """Load model — sync LLM for offline/accuracy, async engine for streaming.""" + tp_size = 
parallelism["tensor_parallel_size"] + pp_size = parallelism["pipeline_parallel_size"] + ep_size = parallelism.get("expert_parallel_size", 1) + assert pp_size <= 1, "Pipeline parallelism is not supported in VLLMRunner" + + max_tokens = parallelism["max_tokens"] + max_model_len = parallelism["max_model_len"] + use_async = parallelism["use_async"] + enforce_eager = getattr(self, "_enforce_eager", False) + + cfg = getattr(self, "_runner_config", {}) + max_num_seqs = cfg.get("max_num_seqs", 512) + gpu_memory_util = cfg.get("gpu_memory_utilization", 0.90) + extra_kwargs = dict(cfg.get("engine_kwargs") or {}) + + try: + import dataclasses + from vllm.engine.arg_utils import EngineArgs as _EngineArgs + _valid = {f.name for f in dataclasses.fields(_EngineArgs)} + _dropped = {k: v for k, v in extra_kwargs.items() if k not in _valid} + if _dropped: + print(f" Warning: engine_kwargs keys not supported by this " + f"vLLM version and will be ignored: {list(_dropped)}") + extra_kwargs = {k: v for k, v in extra_kwargs.items() if k in _valid} + except Exception: + pass + + effective_precision = getattr(self, "_effective_precision", "BF16").upper() + precision = getattr(self, "_precision", None) or effective_precision + + _dtype_override = getattr(self, "_precision_dtype_override", None) + _prec_eng_kwargs = dict(getattr(self, "_precision_engine_kwargs", None) or {}) + + quantization = _prec_eng_kwargs.pop("quantization", None) + + _NATIVE_DTYPE_MAP = { + "BF16": "bfloat16", + "FP16": "float16", + "FP32": "float32", + } + dtype = _NATIVE_DTYPE_MAP.get(precision, "auto") + self._quantization_method = quantization + + if _dtype_override: + dtype = _dtype_override + + if _prec_eng_kwargs: + _prec_eng_kwargs.update(extra_kwargs) + extra_kwargs = _prec_eng_kwargs + + print(f"Loading model: precision={precision}, dtype={dtype}" + + (f", quantization_method={self._quantization_method}" + if self._quantization_method else "")) + + self.tokenizer = AutoTokenizer.from_pretrained( + 
model_path, trust_remote_code=False + ) + + self.sampling_params = SamplingParams( + max_tokens=max_tokens, + temperature=0.0, + ) + + if not use_async: + llm_kwargs = dict( + model=model_path, + dtype=dtype, + tensor_parallel_size=tp_size, + trust_remote_code=False, + enforce_eager=enforce_eager, + max_num_seqs=max_num_seqs, + gpu_memory_utilization=gpu_memory_util, + **extra_kwargs, + ) + if ep_size > 1: + llm_kwargs["enable_expert_parallel"] = True + llm_kwargs["tensor_parallel_size"] = tp_size + if quantization: + llm_kwargs["quantization"] = quantization + if max_model_len: + llm_kwargs["max_model_len"] = max_model_len + self.llm = LLM(**llm_kwargs) + else: + self._loop = asyncio.new_event_loop() + asyncio.set_event_loop(self._loop) + engine_kwargs = dict( + model=model_path, + dtype=dtype, + tensor_parallel_size=tp_size, + trust_remote_code=False, + enforce_eager=enforce_eager, + gpu_memory_utilization=gpu_memory_util, + **extra_kwargs, + ) + if ep_size > 1: + engine_kwargs["enable_expert_parallel"] = True + if max_model_len: + engine_kwargs["max_model_len"] = max_model_len + engine_args = AsyncEngineArgs(**engine_kwargs) + self.engine = AsyncLLMEngine.from_engine_args(engine_args) + + def get_effective_dtype(self) -> Optional[str]: + try: + if self.llm is not None: + dtype = self.llm.llm_engine.model_config.dtype + return str(dtype).replace("torch.", "") + elif self.engine is not None: + dtype = self.engine.engine.model_config.dtype + return str(dtype).replace("torch.", "") + except Exception: + pass + return getattr(self, "_effective_dtype", None) + + def inference_fn_offline(self, requests: list[InferenceRequest]) -> list[InferenceResult]: + formatted = [self._format_prompt(r.prompt) for r in requests] + t_start = time.perf_counter() + outputs = self.llm.generate(formatted, self.sampling_params) + elapsed = time.perf_counter() - t_start + + self._last_accuracy_outputs = [o.outputs[0].text for o in outputs] + + results = [] + for output in outputs: + 
results.append(InferenceResult( + first_token_time_ms=None, + total_time_ms=elapsed * 1000, + output_tokens=len(output.outputs[0].token_ids), + input_tokens=len(output.prompt_token_ids), + success=True, + output_text=output.outputs[0].text, + )) + return results + + async def inference_fn_streaming(self, request: InferenceRequest) -> InferenceResult: + from vllm.utils import random_uuid + + formatted = self._format_prompt(request.prompt) + request_id = random_uuid() + t_start = time.perf_counter() + first_token_time_ms = None + output_tokens = 0 + output_text = "" + + async for output in self.engine.generate( + formatted, self.sampling_params, request_id + ): + if ( + first_token_time_ms is None + and len(output.outputs[0].token_ids) > 0 + ): + first_token_time_ms = (time.perf_counter() - t_start) * 1000 + output_tokens = len(output.outputs[0].token_ids) + output_text = output.outputs[0].text + + total_time_ms = (time.perf_counter() - t_start) * 1000 + return InferenceResult( + first_token_time_ms=first_token_time_ms, + total_time_ms=total_time_ms, + output_tokens=output_tokens, + input_tokens=0, + success=True, + output_text=output_text, + ) + + async def inference_fn_token_stream(self, request: InferenceRequest): + from vllm.utils import random_uuid + + formatted = self._format_prompt(request.prompt) + request_id = random_uuid() + prev_length = 0 + + async for output in self.engine.generate( + formatted, self.sampling_params, request_id + ): + current_text = output.outputs[0].text + delta = current_text[prev_length:] + if delta: + yield delta + prev_length = len(current_text) + + def get_peak_memory_gb(self) -> float: + try: + return torch.cuda.max_memory_allocated() / (1024 ** 3) + except Exception: + return None + + def release_resources(self) -> None: + if self.llm is not None: + try: + del self.llm + except Exception: + pass + self.llm = None + + if self.engine is not None: + try: + if self._loop and not self._loop.is_closed(): + 
self._loop.run_until_complete(self.engine.shutdown()) + except Exception: + pass + try: + del self.engine + except Exception: + pass + self.engine = None + + try: + from vllm.distributed.parallel_state import cleanup_dist_env_and_memory + cleanup_dist_env_and_memory(shutdown_ray=False) + except Exception: + try: + from vllm.distributed.parallel_state import ( + destroy_model_parallel, destroy_distributed_environment, + ) + destroy_model_parallel() + destroy_distributed_environment() + except Exception: + pass + + try: + if torch.distributed.is_initialized(): + torch.distributed.destroy_process_group() + except Exception: + pass + + def parse_args(self): + args = super().parse_args() + cfg = self._runner_config + + import argparse + parser = argparse.ArgumentParser(add_help=False) + parser.add_argument("--tensor-parallel-size", type=int, default=None, + dest="tensor_parallel_size") + parser.add_argument("--pipeline-parallel-size", type=int, default=None, + dest="pipeline_parallel_size") + parser.add_argument("--expert-parallel-size", type=int, default=None, + dest="expert_parallel_size") + parser.add_argument("--enforce-eager", action="store_true", default=False, + dest="enforce_eager") + extra, _ = parser.parse_known_args() + + tp_size, _tp_source = self._resolve_tensor_parallel_size( + extra.tensor_parallel_size + ) + + pp_size = (extra.pipeline_parallel_size + if extra.pipeline_parallel_size is not None + else cfg.get("pipeline_parallel_size", 1)) + ep_size = (extra.expert_parallel_size + if extra.expert_parallel_size is not None + else cfg.get("expert_parallel_size", 1)) + self._enforce_eager = extra.enforce_eager or cfg.get("enforce_eager", False) + + print(f" tensor_parallel_size = {tp_size} [{_tp_source}]") + if ep_size > 1: + print(f" expert_parallel_size = {ep_size} [cli/yaml]") + + if not self.SUPPORTS_MULTI_CHIP and tp_size * pp_size > 1: + print(f"Warning: {self.__class__.__name__} does not support multi-chip. 
" + f"Ignoring tensor_parallel_size={tp_size}, using 1.") + tp_size = 1 + pp_size = 1 + ep_size = 1 + + self._parallelism = { + "tensor_parallel_size": tp_size, + "pipeline_parallel_size": pp_size, + "expert_parallel_size": ep_size, + "data_parallel_size": 1, + } + self._chip_count = tp_size * pp_size + self._precision = getattr(args, "precision", None) + return args + + def get_extra_subprocess_args(self, args) -> list[str]: + extra = [ + "--tensor-parallel-size", + str(self._parallelism.get("tensor_parallel_size", 1)), + ] + if self._parallelism.get("pipeline_parallel_size", 1) > 1: + extra += ["--pipeline-parallel-size", + str(self._parallelism["pipeline_parallel_size"])] + if self._parallelism.get("expert_parallel_size", 1) > 1: + extra += ["--expert-parallel-size", + str(self._parallelism["expert_parallel_size"])] + if self._enforce_eager: + extra += ["--enforce-eager"] + return extra + + +if __name__ == "__main__": + VLLMRunner().main()