diff --git a/README.md b/README.md
index ea9e2b6..3007966 100644
--- a/README.md
+++ b/README.md
@@ -93,6 +93,7 @@ Reference runners live under `runners/` (see each folder’s `meta.json`). The t
| Huawei Ascend NPU | `ascend_vllm_ascend_d4aa9fda` | vllm-ascend | ✓ | ✓ | ✓ | ✓ | ✓ | — | — |
| Apple Silicon | `apple_mlx_lm_9546b8b5` | mlx-lm | ⋯ | — | — | ⋯ | — | ⋯ | — |
| Google TPU | `google_vllm_tpu_68cc9ffa` | vllm-tpu | ✓ | — | — | ✓ | — | ✓ | — |
+| Moore Threads GPU | `moorethreads_vllm_musa_f2f6f965` | vllm-musa | ✓ | ⋯ | ⋯ | ⋯ | ⋯ | ✓ | — |
_Legend: ✓ validated · ⋯ author-declared (not smoke-tested in this repo yet) · — unsupported._
diff --git a/configs/runner_configs/runner_moorethreads_vllm_musa_f2f6f965.yaml.example b/configs/runner_configs/runner_moorethreads_vllm_musa_f2f6f965.yaml.example
new file mode 100644
index 0000000..c18f98b
--- /dev/null
+++ b/configs/runner_configs/runner_moorethreads_vllm_musa_f2f6f965.yaml.example
@@ -0,0 +1,60 @@
+# AccelMark runner config — moorethreads_vllm_musa_f2f6f965 (vllm-musa on Moore Threads)
+#
+# Copy this file to runner_moorethreads_vllm_musa_f2f6f965.yaml (remove
+# .example suffix) and edit as needed for your hardware. The actual .yaml
+# is gitignored.
+#
+# These settings adapt the runner to your hardware environment. They are
+# recorded in result.json task.extra_config for transparency but are NOT
+# part of the benchmark identity (not hashed into run_id).
+#
+# Merge priority: CLI flags > suite-specific > global defaults > runner defaults
+
+# ── Global defaults (apply to all suites) ─────────────────────────────────────
+
+# Tensor parallel size — number of Moore Threads GPUs to use (default: 1).
+# For multi-card runs make sure to export VLLM_WORKER_MULTIPROC_METHOD=spawn.
+tensor_parallel_size: 1
+
+# Disable Triton CUDA-graph / compilation. Set true if you hit Triton kernel
+# errors on first request (most common on S3000 / S80 paths).
+enforce_eager: false
+
+# Maximum number of sequences in a batch (default: 256).
+# Reduce on lower-memory cards: 128 on 24 GB cards, 64 on 16 GB cards.
+max_num_seqs: 256
+
+# Fraction of MUSA device memory vLLM may use for weights, activations and the
+# KV cache (default: 0.85). Reduce if you hit OOM; the vLLM flag is named
+# gpu_memory_utilization but applies to MUSA device memory via torchada.
+gpu_memory_utilization: 0.85
+
+# Pass-through kwargs forwarded directly to vLLM LLM() / AsyncEngineArgs().
+# Unknown keys are dropped automatically with a warning, so this is safe to
+# use across vLLM 0.10.x / 0.13.x.
+# engine_kwargs:
+# swap_space: 8
+# max_seq_len_to_capture: 4096
+
+# ── Suite-specific overrides ───────────────────────────────────────────────────
+
+suites:
+ suite_D:
+ # Long-context — reduce batch size and lower gpu_memory_utilization to leave more memory headroom.
+ max_num_seqs: 32
+ gpu_memory_utilization: 0.80
+
+ suite_F:
+ max_num_seqs: 128
+
+# ── Speculative decoding (suite_A / suite_D extra scenario) ─────────────────
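+# To enable, merge the suite_A entry below into the existing `suites:` mapping
+# above (do not add a second top-level `suites:` key; duplicate keys are invalid
+# YAML). vllm-musa accepts the same speculative_config dict as upstream vLLM; the
+# runner translates flat keys (speculative_model, num_speculative_tokens, ...) into it automatically.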
+# suites:
+# suite_A:
+# engine_kwargs:
+# speculative_model: "meta-llama/Llama-3.2-1B-Instruct"
+# num_speculative_tokens: 4
+# speculative_draft_tensor_parallel_size: 1
diff --git a/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/accuracy/accuracy.json b/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/accuracy/accuracy.json
new file mode 100644
index 0000000..7242234
--- /dev/null
+++ b/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/accuracy/accuracy.json
@@ -0,0 +1,8 @@
+{
+ "subset_score": 0.07,
+ "baseline_delta": -0.53,
+ "valid": false,
+ "framework": "vllm-musa",
+ "precision": "BF16",
+ "notes": "Integrated accuracy check \u2014 used same vllm-musa instance as benchmark."
+}
\ No newline at end of file
diff --git a/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/env_info.json b/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/env_info.json
new file mode 100644
index 0000000..4244ef7
--- /dev/null
+++ b/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/env_info.json
@@ -0,0 +1,48 @@
+{
+ "collected_at": "2026-05-18T09:21:31.092840+00:00",
+ "accelerators": [
+ {
+ "index": 0,
+ "name": "MTT S4000",
+ "vendor": "Moore Threads",
+ "memory_gb": 48.0,
+ "driver_version": "2.7.0",
+ "firmware_version": null,
+ "supports_bf16": true
+ }
+ ],
+ "accelerator_platform": "moorethreads",
+ "accelerator_topology": null,
+ "intra_node_interconnect": null,
+ "cpu": {
+ "model": "Intel(R) Xeon(R) Gold 6430",
+ "physical_cores": 64,
+ "logical_cores": 128,
+ "numa_nodes": 2
+ },
+ "system_memory_gb": 1007.5,
+ "pcie_generation": "PCIe 16x/16x",
+ "cpu_accelerator_bandwidth_gbs": null,
+ "network_interfaces": [
+ {
+ "name": "mlx5_0",
+ "type": "InfiniBand/RoCE",
+ "bandwidth_gbps": null
+ },
+ {
+ "name": "mlx5_1",
+ "type": "InfiniBand/RoCE",
+ "bandwidth_gbps": null
+ },
+ {
+ "name": "mlx5_bond_0",
+ "type": "InfiniBand/RoCE",
+ "bandwidth_gbps": null
+ }
+ ],
+ "os": "Ubuntu Jammy Jellyfish (development branch)",
+ "python_version": "3.10.8",
+ "kernel_version": "5.15.0-105-generic",
+ "runtime_version": "Moore Threads Driver 2.7.0",
+ "pytorch_version": "2.2.0"
+}
\ No newline at end of file
diff --git a/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/offline/result.json b/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/offline/result.json
new file mode 100644
index 0000000..a050fe4
--- /dev/null
+++ b/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/offline/result.json
@@ -0,0 +1,164 @@
+{
+ "schema_version": "1.0",
+ "suite_id": "suite_A",
+ "implementation_id": "moorethreads_vllm_musa_f2f6f965",
+ "chip": {
+ "name": "MTT S4000",
+ "vendor": "Moore Threads",
+ "count": 1,
+ "memory_gb": 48.0,
+ "interconnect_intra_node": null,
+ "interconnect_inter_node": null
+ },
+ "environment": {
+ "collected_at": "2026-05-18T09:21:31.092840+00:00",
+ "accelerators": [
+ {
+ "index": 0,
+ "name": "MTT S4000",
+ "vendor": "Moore Threads",
+ "memory_gb": 48.0,
+ "driver_version": "2.7.0",
+ "firmware_version": null,
+ "supports_bf16": true
+ }
+ ],
+ "accelerator_platform": "moorethreads",
+ "accelerator_topology": null,
+ "intra_node_interconnect": null,
+ "cpu": {
+ "model": "Intel(R) Xeon(R) Gold 6430",
+ "physical_cores": 64,
+ "logical_cores": 128,
+ "numa_nodes": 2
+ },
+ "system_memory_gb": 1007.5,
+ "pcie_generation": "PCIe 16x/16x",
+ "cpu_accelerator_bandwidth_gbs": null,
+ "network_interfaces": [
+ {
+ "name": "mlx5_0",
+ "type": "InfiniBand/RoCE",
+ "bandwidth_gbps": null
+ },
+ {
+ "name": "mlx5_1",
+ "type": "InfiniBand/RoCE",
+ "bandwidth_gbps": null
+ },
+ {
+ "name": "mlx5_bond_0",
+ "type": "InfiniBand/RoCE",
+ "bandwidth_gbps": null
+ }
+ ],
+ "os": "Ubuntu Jammy Jellyfish (development branch)",
+ "python_version": "3.10.8",
+ "kernel_version": "5.15.0-105-generic",
+ "runtime_version": "Moore Threads Driver 2.7.0",
+ "pytorch_version": "2.2.0"
+ },
+ "software": {
+ "framework": "vllm-musa",
+ "framework_version": "0.4.2",
+ "driver_version": "2.7.0",
+ "runtime_version": "Moore Threads Driver 2.7.0",
+ "os": "Ubuntu Jammy Jellyfish (development branch)",
+ "python_version": "3.10.8"
+ },
+ "model": {
+ "model_id": "meta-llama/Meta-Llama-3-8B-Instruct",
+ "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2",
+ "model_name": null,
+ "model_note": null,
+ "model_source": "local",
+ "architecture": "dense",
+ "parameter_count_b": 8.0,
+ "precision": "BF16",
+ "effective_dtype": "float16",
+ "quantization_method": null,
+ "model_format": "HuggingFace original"
+ },
+ "task": {
+ "scenario": "offline",
+ "num_runs": 3,
+ "warmup_runs": 1,
+ "parallelism": {
+ "tensor_parallel_size": 1,
+ "pipeline_parallel_size": 1,
+ "expert_parallel_size": 1,
+ "data_parallel_size": 1
+ },
+ "extra_config": null,
+ "runtime_metrics": null
+ },
+ "metrics": {
+ "offline": {
+ "results_by_concurrency": [
+ {
+ "client_concurrency": 8,
+ "throughput_tokens_per_sec": 332.62,
+ "throughput_tokens_per_sec_per_chip": 332.62,
+ "throughput_tokens_per_sec_total": 922.83,
+ "elapsed_seconds_median": 43.4,
+ "peak_memory_gb": null,
+ "power_watts_avg": null,
+ "power_watts_peak": null,
+ "oom": false,
+ "_throughput_note": "output_only",
+ "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+ },
+ {
+ "client_concurrency": 32,
+ "throughput_tokens_per_sec": 331.64,
+ "throughput_tokens_per_sec_per_chip": 331.64,
+ "throughput_tokens_per_sec_total": 920.1,
+ "elapsed_seconds_median": 43.6,
+ "peak_memory_gb": null,
+ "power_watts_avg": null,
+ "power_watts_peak": null,
+ "oom": false,
+ "_throughput_note": "output_only",
+ "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+ },
+ {
+ "client_concurrency": 128,
+ "throughput_tokens_per_sec": 331.76,
+ "throughput_tokens_per_sec_per_chip": 331.76,
+ "throughput_tokens_per_sec_total": 920.46,
+ "elapsed_seconds_median": 43.6,
+ "peak_memory_gb": null,
+ "power_watts_avg": null,
+ "power_watts_peak": null,
+ "oom": false,
+ "_throughput_note": "output_only",
+ "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+ }
+ ]
+ }
+ },
+ "accuracy": {
+ "subset_score": null,
+ "baseline_delta": null,
+ "valid": false,
+ "notes": "Run --scenario accuracy to check model accuracy."
+ },
+ "meta": {
+ "submitted_by": "JuhaoLiang1997",
+ "submission_type": "individual",
+ "date": "2026-05-18",
+ "time": "17:34:52",
+ "run_id": "cabb7bd0",
+ "run_name": "mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0",
+ "flagged": null,
+ "reproduce_script": "runners/moorethreads_vllm_musa_f2f6f965/runner.py",
+ "env_info_file": "../env_info.json",
+ "log_file": "run.log",
+ "samples_file": "samples.jsonl",
+ "notes": null,
+ "benchmark_start_time": "2026-05-18T09:26:10.676960+00:00",
+ "benchmark_end_time": "2026-05-18T09:34:52.667112+00:00",
+ "benchmark_elapsed_minutes": 8.7,
+ "model_load_seconds": 116.8
+ }
+}
\ No newline at end of file
diff --git a/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/online/result.json b/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/online/result.json
new file mode 100644
index 0000000..064d6b8
--- /dev/null
+++ b/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/online/result.json
@@ -0,0 +1,163 @@
+{
+ "schema_version": "1.0",
+ "suite_id": "suite_A",
+ "implementation_id": "moorethreads_vllm_musa_f2f6f965",
+ "chip": {
+ "name": "MTT S4000",
+ "vendor": "Moore Threads",
+ "count": 1,
+ "memory_gb": 48.0,
+ "interconnect_intra_node": null,
+ "interconnect_inter_node": null
+ },
+ "environment": {
+ "collected_at": "2026-05-18T09:21:31.092840+00:00",
+ "accelerators": [
+ {
+ "index": 0,
+ "name": "MTT S4000",
+ "vendor": "Moore Threads",
+ "memory_gb": 48.0,
+ "driver_version": "2.7.0",
+ "firmware_version": null,
+ "supports_bf16": true
+ }
+ ],
+ "accelerator_platform": "moorethreads",
+ "accelerator_topology": null,
+ "intra_node_interconnect": null,
+ "cpu": {
+ "model": "Intel(R) Xeon(R) Gold 6430",
+ "physical_cores": 64,
+ "logical_cores": 128,
+ "numa_nodes": 2
+ },
+ "system_memory_gb": 1007.5,
+ "pcie_generation": "PCIe 16x/16x",
+ "cpu_accelerator_bandwidth_gbs": null,
+ "network_interfaces": [
+ {
+ "name": "mlx5_0",
+ "type": "InfiniBand/RoCE",
+ "bandwidth_gbps": null
+ },
+ {
+ "name": "mlx5_1",
+ "type": "InfiniBand/RoCE",
+ "bandwidth_gbps": null
+ },
+ {
+ "name": "mlx5_bond_0",
+ "type": "InfiniBand/RoCE",
+ "bandwidth_gbps": null
+ }
+ ],
+ "os": "Ubuntu Jammy Jellyfish (development branch)",
+ "python_version": "3.10.8",
+ "kernel_version": "5.15.0-105-generic",
+ "runtime_version": "Moore Threads Driver 2.7.0",
+ "pytorch_version": "2.2.0"
+ },
+ "software": {
+ "framework": "vllm-musa",
+ "framework_version": "0.4.2",
+ "driver_version": "2.7.0",
+ "runtime_version": "Moore Threads Driver 2.7.0",
+ "os": "Ubuntu Jammy Jellyfish (development branch)",
+ "python_version": "3.10.8"
+ },
+ "model": {
+ "model_id": "meta-llama/Meta-Llama-3-8B-Instruct",
+ "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2",
+ "model_name": null,
+ "model_note": null,
+ "model_source": "local",
+ "architecture": "dense",
+ "parameter_count_b": 8.0,
+ "precision": "BF16",
+ "effective_dtype": "float16",
+ "quantization_method": null,
+ "model_format": "HuggingFace original"
+ },
+ "task": {
+ "scenario": "online",
+ "num_runs": 3,
+ "warmup_runs": 1,
+ "parallelism": {
+ "tensor_parallel_size": 1,
+ "pipeline_parallel_size": 1,
+ "expert_parallel_size": 1,
+ "data_parallel_size": 1
+ },
+ "extra_config": null,
+ "runtime_metrics": null
+ },
+ "metrics": {
+ "online": {
+ "sla_ttft_ms": 500,
+ "max_valid_qps": 5,
+ "results_by_qps": [
+ {
+ "target_qps": 5,
+ "achieved_qps": 5.0,
+ "ttft_ms_p50": 194.45,
+ "ttft_ms_p90": 315.05,
+ "ttft_ms_p99": 424.55,
+ "tpot_ms_p50": 201.93,
+ "tpot_ms_p90": 253.8,
+ "tpot_ms_p99": 471.28,
+ "elapsed_seconds_median": 137.6,
+ "sla_met": true
+ },
+ {
+ "target_qps": 25,
+ "achieved_qps": 25.0,
+ "ttft_ms_p50": 4796.14,
+ "ttft_ms_p90": 8459.18,
+ "ttft_ms_p99": 9348.86,
+ "tpot_ms_p50": 355.01,
+ "tpot_ms_p90": 6430.04,
+ "tpot_ms_p99": 15579.83,
+ "elapsed_seconds_median": 93.0,
+ "sla_met": false
+ },
+ {
+ "target_qps": 100,
+ "achieved_qps": 100.0,
+ "ttft_ms_p50": 10354.27,
+ "ttft_ms_p90": 17651.16,
+ "ttft_ms_p99": 19078.89,
+ "tpot_ms_p50": 849.82,
+ "tpot_ms_p90": 8677.79,
+ "tpot_ms_p99": 14281.03,
+ "elapsed_seconds_median": 90.0,
+ "sla_met": false
+ }
+ ]
+ }
+ },
+ "accuracy": {
+ "subset_score": null,
+ "baseline_delta": null,
+ "valid": false,
+ "notes": "Run --scenario accuracy to check model accuracy."
+ },
+ "meta": {
+ "submitted_by": "JuhaoLiang1997",
+ "submission_type": "individual",
+ "date": "2026-05-18",
+ "time": "17:53:38",
+ "run_id": "cabb7bd0",
+ "run_name": "mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0",
+ "flagged": null,
+ "reproduce_script": "runners/moorethreads_vllm_musa_f2f6f965/runner.py",
+ "env_info_file": "../env_info.json",
+ "log_file": "run.log",
+ "samples_file": "samples.jsonl",
+ "notes": null,
+ "benchmark_start_time": "2026-05-18T09:37:13.745117+00:00",
+ "benchmark_end_time": "2026-05-18T09:53:38.865501+00:00",
+ "benchmark_elapsed_minutes": 16.4,
+ "model_load_seconds": 122.7
+ }
+}
\ No newline at end of file
diff --git a/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/result.json b/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/result.json
new file mode 100644
index 0000000..e4b1093
--- /dev/null
+++ b/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/result.json
@@ -0,0 +1,215 @@
+{
+ "schema_version": "1.0",
+ "suite_id": "suite_A",
+ "implementation_id": "moorethreads_vllm_musa_f2f6f965",
+ "chip": {
+ "name": "MTT S4000",
+ "vendor": "Moore Threads",
+ "count": 1,
+ "memory_gb": 48.0,
+ "interconnect_intra_node": null,
+ "interconnect_inter_node": null
+ },
+ "environment": {
+ "collected_at": "2026-05-18T09:21:31.092840+00:00",
+ "accelerators": [
+ {
+ "index": 0,
+ "name": "MTT S4000",
+ "vendor": "Moore Threads",
+ "memory_gb": 48.0,
+ "driver_version": "2.7.0",
+ "firmware_version": null,
+ "supports_bf16": true
+ }
+ ],
+ "accelerator_platform": "moorethreads",
+ "accelerator_topology": null,
+ "intra_node_interconnect": null,
+ "cpu": {
+ "model": "Intel(R) Xeon(R) Gold 6430",
+ "physical_cores": 64,
+ "logical_cores": 128,
+ "numa_nodes": 2
+ },
+ "system_memory_gb": 1007.5,
+ "pcie_generation": "PCIe 16x/16x",
+ "cpu_accelerator_bandwidth_gbs": null,
+ "network_interfaces": [
+ {
+ "name": "mlx5_0",
+ "type": "InfiniBand/RoCE",
+ "bandwidth_gbps": null
+ },
+ {
+ "name": "mlx5_1",
+ "type": "InfiniBand/RoCE",
+ "bandwidth_gbps": null
+ },
+ {
+ "name": "mlx5_bond_0",
+ "type": "InfiniBand/RoCE",
+ "bandwidth_gbps": null
+ }
+ ],
+ "os": "Ubuntu Jammy Jellyfish (development branch)",
+ "python_version": "3.10.8",
+ "kernel_version": "5.15.0-105-generic",
+ "runtime_version": "Moore Threads Driver 2.7.0",
+ "pytorch_version": "2.2.0"
+ },
+ "software": {
+ "framework": "vllm-musa",
+ "framework_version": "0.4.2",
+ "driver_version": "2.7.0",
+ "runtime_version": "Moore Threads Driver 2.7.0",
+ "os": "Ubuntu Jammy Jellyfish (development branch)",
+ "python_version": "3.10.8"
+ },
+ "model": {
+ "model_id": "meta-llama/Meta-Llama-3-8B-Instruct",
+ "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2",
+ "model_name": null,
+ "model_note": null,
+ "model_source": "local",
+ "architecture": "dense",
+ "parameter_count_b": 8.0,
+ "precision": "BF16",
+ "effective_dtype": "float16",
+ "quantization_method": null,
+ "model_format": "HuggingFace original"
+ },
+ "task": {
+ "scenarios_run": [
+ "offline",
+ "online"
+ ],
+ "parallelism": {
+ "tensor_parallel_size": 1,
+ "pipeline_parallel_size": 1,
+ "expert_parallel_size": 1,
+ "data_parallel_size": 1
+ },
+ "num_runs": 3,
+ "extra_config": null
+ },
+ "metrics": {
+ "derived": {},
+ "offline": {
+ "results_by_concurrency": [
+ {
+ "client_concurrency": 8,
+ "throughput_tokens_per_sec": 332.62,
+ "throughput_tokens_per_sec_per_chip": 332.62,
+ "throughput_tokens_per_sec_total": 922.83,
+ "elapsed_seconds_median": 43.4,
+ "peak_memory_gb": null,
+ "power_watts_avg": null,
+ "power_watts_peak": null,
+ "oom": false,
+ "_throughput_note": "output_only",
+ "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+ },
+ {
+ "client_concurrency": 32,
+ "throughput_tokens_per_sec": 331.64,
+ "throughput_tokens_per_sec_per_chip": 331.64,
+ "throughput_tokens_per_sec_total": 920.1,
+ "elapsed_seconds_median": 43.6,
+ "peak_memory_gb": null,
+ "power_watts_avg": null,
+ "power_watts_peak": null,
+ "oom": false,
+ "_throughput_note": "output_only",
+ "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+ },
+ {
+ "client_concurrency": 128,
+ "throughput_tokens_per_sec": 331.76,
+ "throughput_tokens_per_sec_per_chip": 331.76,
+ "throughput_tokens_per_sec_total": 920.46,
+ "elapsed_seconds_median": 43.6,
+ "peak_memory_gb": null,
+ "power_watts_avg": null,
+ "power_watts_peak": null,
+ "oom": false,
+ "_throughput_note": "output_only",
+ "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+ }
+ ]
+ },
+ "online": {
+ "sla_ttft_ms": 500,
+ "max_valid_qps": 5,
+ "results_by_qps": [
+ {
+ "target_qps": 5,
+ "achieved_qps": 5.0,
+ "ttft_ms_p50": 194.45,
+ "ttft_ms_p90": 315.05,
+ "ttft_ms_p99": 424.55,
+ "tpot_ms_p50": 201.93,
+ "tpot_ms_p90": 253.8,
+ "tpot_ms_p99": 471.28,
+ "elapsed_seconds_median": 137.6,
+ "sla_met": true
+ },
+ {
+ "target_qps": 25,
+ "achieved_qps": 25.0,
+ "ttft_ms_p50": 4796.14,
+ "ttft_ms_p90": 8459.18,
+ "ttft_ms_p99": 9348.86,
+ "tpot_ms_p50": 355.01,
+ "tpot_ms_p90": 6430.04,
+ "tpot_ms_p99": 15579.83,
+ "elapsed_seconds_median": 93.0,
+ "sla_met": false
+ },
+ {
+ "target_qps": 100,
+ "achieved_qps": 100.0,
+ "ttft_ms_p50": 10354.27,
+ "ttft_ms_p90": 17651.16,
+ "ttft_ms_p99": 19078.89,
+ "tpot_ms_p50": 849.82,
+ "tpot_ms_p90": 8677.79,
+ "tpot_ms_p99": 14281.03,
+ "elapsed_seconds_median": 90.0,
+ "sla_met": false
+ }
+ ]
+ }
+ },
+ "accuracy": {
+ "subset_score": 0.07,
+ "baseline_delta": -0.53,
+ "valid": false,
+ "framework": "vllm-musa",
+ "precision": "BF16",
+ "notes": "Integrated accuracy check \u2014 used same vllm-musa instance as benchmark."
+ },
+ "meta": {
+ "submitted_by": "JuhaoLiang1997",
+ "submission_type": "individual",
+ "date": "2026-05-18",
+ "time": "17:34:52",
+ "run_id": "cabb7bd0",
+ "run_name": "mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0",
+ "flagged": null,
+ "reproduce_script": "runners/moorethreads_vllm_musa_f2f6f965/runner.py",
+ "env_info_file": "../env_info.json",
+ "log_file": "run.log",
+ "samples_file": "samples.jsonl",
+ "notes": "Partial run: ['offline', 'online'] succeeded, ['accuracy'] failed.",
+ "benchmark_start_time": "2026-05-18T09:26:10.676960+00:00",
+ "benchmark_end_time": "2026-05-18T09:34:52.667112+00:00",
+ "benchmark_elapsed_minutes": 25.1,
+ "model_load_seconds": 116.8,
+ "benchmark_elapsed_minutes_note": "Total across ['offline', 'online'] scenarios.",
+ "scenario_dirs": {
+ "offline": "results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/offline",
+ "online": "results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/online"
+ }
+ }
+}
\ No newline at end of file
diff --git a/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/accuracy/accuracy.json b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/accuracy/accuracy.json
new file mode 100644
index 0000000..63c6e92
--- /dev/null
+++ b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/accuracy/accuracy.json
@@ -0,0 +1,8 @@
+{
+ "subset_score": 0.07,
+ "baseline_delta": -0.31,
+ "valid": false,
+ "framework": "vllm-musa",
+ "precision": "BF16",
+ "notes": "Integrated accuracy check \u2014 used same vllm-musa instance as benchmark."
+}
\ No newline at end of file
diff --git a/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/env_info.json b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/env_info.json
new file mode 100644
index 0000000..31f501b
--- /dev/null
+++ b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/env_info.json
@@ -0,0 +1,48 @@
+{
+ "collected_at": "2026-05-18T08:40:55.208034+00:00",
+ "accelerators": [
+ {
+ "index": 0,
+ "name": "MTT S4000",
+ "vendor": "Moore Threads",
+ "memory_gb": 48.0,
+ "driver_version": "2.7.0",
+ "firmware_version": null,
+ "supports_bf16": true
+ }
+ ],
+ "accelerator_platform": "moorethreads",
+ "accelerator_topology": null,
+ "intra_node_interconnect": null,
+ "cpu": {
+ "model": "Intel(R) Xeon(R) Gold 6430",
+ "physical_cores": 64,
+ "logical_cores": 128,
+ "numa_nodes": 2
+ },
+ "system_memory_gb": 1007.5,
+ "pcie_generation": "PCIe 16x/16x",
+ "cpu_accelerator_bandwidth_gbs": null,
+ "network_interfaces": [
+ {
+ "name": "mlx5_0",
+ "type": "InfiniBand/RoCE",
+ "bandwidth_gbps": null
+ },
+ {
+ "name": "mlx5_1",
+ "type": "InfiniBand/RoCE",
+ "bandwidth_gbps": null
+ },
+ {
+ "name": "mlx5_bond_0",
+ "type": "InfiniBand/RoCE",
+ "bandwidth_gbps": null
+ }
+ ],
+ "os": "Ubuntu Jammy Jellyfish (development branch)",
+ "python_version": "3.10.8",
+ "kernel_version": "5.15.0-105-generic",
+ "runtime_version": "Moore Threads Driver 2.7.0",
+ "pytorch_version": "2.2.0"
+}
\ No newline at end of file
diff --git a/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/interactive/result.json b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/interactive/result.json
new file mode 100644
index 0000000..4f5ff81
--- /dev/null
+++ b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/interactive/result.json
@@ -0,0 +1,131 @@
+{
+ "schema_version": "1.0",
+ "suite_id": "suite_F",
+ "implementation_id": "moorethreads_vllm_musa_f2f6f965",
+ "chip": {
+ "name": "MTT S4000",
+ "vendor": "Moore Threads",
+ "count": 1,
+ "memory_gb": 48.0,
+ "interconnect_intra_node": null,
+ "interconnect_inter_node": null
+ },
+ "environment": {
+ "collected_at": "2026-05-18T08:40:55.208034+00:00",
+ "accelerators": [
+ {
+ "index": 0,
+ "name": "MTT S4000",
+ "vendor": "Moore Threads",
+ "memory_gb": 48.0,
+ "driver_version": "2.7.0",
+ "firmware_version": null,
+ "supports_bf16": true
+ }
+ ],
+ "accelerator_platform": "moorethreads",
+ "accelerator_topology": null,
+ "intra_node_interconnect": null,
+ "cpu": {
+ "model": "Intel(R) Xeon(R) Gold 6430",
+ "physical_cores": 64,
+ "logical_cores": 128,
+ "numa_nodes": 2
+ },
+ "system_memory_gb": 1007.5,
+ "pcie_generation": "PCIe 16x/16x",
+ "cpu_accelerator_bandwidth_gbs": null,
+ "network_interfaces": [
+ {
+ "name": "mlx5_0",
+ "type": "InfiniBand/RoCE",
+ "bandwidth_gbps": null
+ },
+ {
+ "name": "mlx5_1",
+ "type": "InfiniBand/RoCE",
+ "bandwidth_gbps": null
+ },
+ {
+ "name": "mlx5_bond_0",
+ "type": "InfiniBand/RoCE",
+ "bandwidth_gbps": null
+ }
+ ],
+ "os": "Ubuntu Jammy Jellyfish (development branch)",
+ "python_version": "3.10.8",
+ "kernel_version": "5.15.0-105-generic",
+ "runtime_version": "Moore Threads Driver 2.7.0",
+ "pytorch_version": "2.2.0"
+ },
+ "software": {
+ "framework": "vllm-musa",
+ "framework_version": "0.4.2",
+ "driver_version": "2.7.0",
+ "runtime_version": "Moore Threads Driver 2.7.0",
+ "os": "Ubuntu Jammy Jellyfish (development branch)",
+ "python_version": "3.10.8"
+ },
+ "model": {
+ "model_id": "Qwen/Qwen2.5-0.5B-Instruct",
+ "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775",
+ "model_name": null,
+ "model_note": null,
+ "model_source": "local",
+ "architecture": "dense",
+ "parameter_count_b": 0.5,
+ "precision": "BF16",
+ "effective_dtype": "float16",
+ "quantization_method": null,
+ "model_format": "HuggingFace original"
+ },
+ "task": {
+ "scenario": "interactive",
+ "num_runs": 3,
+ "warmup_runs": 1,
+ "parallelism": {
+ "tensor_parallel_size": 1,
+ "pipeline_parallel_size": 1,
+ "expert_parallel_size": 1,
+ "data_parallel_size": 1
+ },
+ "extra_config": null,
+ "runtime_metrics": null
+ },
+ "metrics": {
+ "interactive": {
+ "ttft_ms_p50": 25.89,
+ "ttft_ms_p90": 27.18,
+ "ttft_ms_p99": 28.51,
+ "tpot_ms_p50": 14.85,
+ "tpot_ms_p90": 15.17,
+ "tpot_ms_p99": 15.5,
+ "peak_memory_gb": null,
+ "elapsed_seconds_median": 481.4
+ }
+ },
+ "accuracy": {
+ "subset_score": null,
+ "baseline_delta": null,
+ "valid": false,
+ "notes": "Run --scenario accuracy to check model accuracy."
+ },
+ "meta": {
+ "submitted_by": "JuhaoLiang1997",
+ "submission_type": "individual",
+ "date": "2026-05-18",
+ "time": "17:21:09",
+ "run_id": "4f66d29d",
+ "run_name": "mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d",
+ "flagged": null,
+ "reproduce_script": "runners/moorethreads_vllm_musa_f2f6f965/runner.py",
+ "env_info_file": "../env_info.json",
+ "log_file": "run.log",
+ "samples_file": "samples.jsonl",
+ "notes": null,
+ "benchmark_start_time": "2026-05-18T08:56:46.686185+00:00",
+ "benchmark_end_time": "2026-05-18T09:21:09.800661+00:00",
+ "benchmark_elapsed_minutes": 24.4,
+ "model_load_seconds": 151.2
+ }
+}
\ No newline at end of file
diff --git a/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/offline/result.json b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/offline/result.json
new file mode 100644
index 0000000..2498167
--- /dev/null
+++ b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/offline/result.json
@@ -0,0 +1,164 @@
+{
+ "schema_version": "1.0",
+ "suite_id": "suite_F",
+ "implementation_id": "moorethreads_vllm_musa_f2f6f965",
+ "chip": {
+ "name": "MTT S4000",
+ "vendor": "Moore Threads",
+ "count": 1,
+ "memory_gb": 48.0,
+ "interconnect_intra_node": null,
+ "interconnect_inter_node": null
+ },
+ "environment": {
+ "collected_at": "2026-05-18T08:40:55.208034+00:00",
+ "accelerators": [
+ {
+ "index": 0,
+ "name": "MTT S4000",
+ "vendor": "Moore Threads",
+ "memory_gb": 48.0,
+ "driver_version": "2.7.0",
+ "firmware_version": null,
+ "supports_bf16": true
+ }
+ ],
+ "accelerator_platform": "moorethreads",
+ "accelerator_topology": null,
+ "intra_node_interconnect": null,
+ "cpu": {
+ "model": "Intel(R) Xeon(R) Gold 6430",
+ "physical_cores": 64,
+ "logical_cores": 128,
+ "numa_nodes": 2
+ },
+ "system_memory_gb": 1007.5,
+ "pcie_generation": "PCIe 16x/16x",
+ "cpu_accelerator_bandwidth_gbs": null,
+ "network_interfaces": [
+ {
+ "name": "mlx5_0",
+ "type": "InfiniBand/RoCE",
+ "bandwidth_gbps": null
+ },
+ {
+ "name": "mlx5_1",
+ "type": "InfiniBand/RoCE",
+ "bandwidth_gbps": null
+ },
+ {
+ "name": "mlx5_bond_0",
+ "type": "InfiniBand/RoCE",
+ "bandwidth_gbps": null
+ }
+ ],
+ "os": "Ubuntu Jammy Jellyfish (development branch)",
+ "python_version": "3.10.8",
+ "kernel_version": "5.15.0-105-generic",
+ "runtime_version": "Moore Threads Driver 2.7.0",
+ "pytorch_version": "2.2.0"
+ },
+ "software": {
+ "framework": "vllm-musa",
+ "framework_version": "0.4.2",
+ "driver_version": "2.7.0",
+ "runtime_version": "Moore Threads Driver 2.7.0",
+ "os": "Ubuntu Jammy Jellyfish (development branch)",
+ "python_version": "3.10.8"
+ },
+ "model": {
+ "model_id": "Qwen/Qwen2.5-0.5B-Instruct",
+ "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775",
+ "model_name": null,
+ "model_note": null,
+ "model_source": "local",
+ "architecture": "dense",
+ "parameter_count_b": 0.5,
+ "precision": "BF16",
+ "effective_dtype": "float16",
+ "quantization_method": null,
+ "model_format": "HuggingFace original"
+ },
+ "task": {
+ "scenario": "offline",
+ "num_runs": 3,
+ "warmup_runs": 1,
+ "parallelism": {
+ "tensor_parallel_size": 1,
+ "pipeline_parallel_size": 1,
+ "expert_parallel_size": 1,
+ "data_parallel_size": 1
+ },
+ "extra_config": null,
+ "runtime_metrics": null
+ },
+ "metrics": {
+ "offline": {
+ "results_by_concurrency": [
+ {
+ "client_concurrency": 4,
+ "throughput_tokens_per_sec": 1994.51,
+ "throughput_tokens_per_sec_per_chip": 1994.51,
+ "throughput_tokens_per_sec_total": 3642.41,
+ "elapsed_seconds_median": 12.5,
+ "peak_memory_gb": null,
+ "power_watts_avg": null,
+ "power_watts_peak": null,
+ "oom": false,
+ "_throughput_note": "output_only",
+ "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+ },
+ {
+ "client_concurrency": 16,
+ "throughput_tokens_per_sec": 1998.44,
+ "throughput_tokens_per_sec_per_chip": 1998.44,
+ "throughput_tokens_per_sec_total": 3649.59,
+ "elapsed_seconds_median": 12.5,
+ "peak_memory_gb": null,
+ "power_watts_avg": null,
+ "power_watts_peak": null,
+ "oom": false,
+ "_throughput_note": "output_only",
+ "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+ },
+ {
+ "client_concurrency": 64,
+ "throughput_tokens_per_sec": 2004.02,
+ "throughput_tokens_per_sec_per_chip": 2004.02,
+ "throughput_tokens_per_sec_total": 3659.77,
+ "elapsed_seconds_median": 12.5,
+ "peak_memory_gb": null,
+ "power_watts_avg": null,
+ "power_watts_peak": null,
+ "oom": false,
+ "_throughput_note": "output_only",
+ "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+ }
+ ]
+ }
+ },
+ "accuracy": {
+ "subset_score": null,
+ "baseline_delta": null,
+ "valid": false,
+ "notes": "Run --scenario accuracy to check model accuracy."
+ },
+ "meta": {
+ "submitted_by": "JuhaoLiang1997",
+ "submission_type": "individual",
+ "date": "2026-05-18",
+ "time": "16:48:27",
+ "run_id": "4f66d29d",
+ "run_name": "mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d",
+ "flagged": null,
+ "reproduce_script": "runners/moorethreads_vllm_musa_f2f6f965/runner.py",
+ "env_info_file": "../env_info.json",
+ "log_file": "run.log",
+ "samples_file": "samples.jsonl",
+ "notes": null,
+ "benchmark_start_time": "2026-05-18T08:45:57.373367+00:00",
+ "benchmark_end_time": "2026-05-18T08:48:27.423209+00:00",
+ "benchmark_elapsed_minutes": 2.5,
+ "model_load_seconds": 146.8
+ }
+}
\ No newline at end of file
diff --git a/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/online/result.json b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/online/result.json
new file mode 100644
index 0000000..eb13372
--- /dev/null
+++ b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/online/result.json
@@ -0,0 +1,151 @@
+{
+ "schema_version": "1.0",
+ "suite_id": "suite_F",
+ "implementation_id": "moorethreads_vllm_musa_f2f6f965",
+ "chip": {
+ "name": "MTT S4000",
+ "vendor": "Moore Threads",
+ "count": 1,
+ "memory_gb": 48.0,
+ "interconnect_intra_node": null,
+ "interconnect_inter_node": null
+ },
+ "environment": {
+ "collected_at": "2026-05-18T08:40:55.208034+00:00",
+ "accelerators": [
+ {
+ "index": 0,
+ "name": "MTT S4000",
+ "vendor": "Moore Threads",
+ "memory_gb": 48.0,
+ "driver_version": "2.7.0",
+ "firmware_version": null,
+ "supports_bf16": true
+ }
+ ],
+ "accelerator_platform": "moorethreads",
+ "accelerator_topology": null,
+ "intra_node_interconnect": null,
+ "cpu": {
+ "model": "Intel(R) Xeon(R) Gold 6430",
+ "physical_cores": 64,
+ "logical_cores": 128,
+ "numa_nodes": 2
+ },
+ "system_memory_gb": 1007.5,
+ "pcie_generation": "PCIe 16x/16x",
+ "cpu_accelerator_bandwidth_gbs": null,
+ "network_interfaces": [
+ {
+ "name": "mlx5_0",
+ "type": "InfiniBand/RoCE",
+ "bandwidth_gbps": null
+ },
+ {
+ "name": "mlx5_1",
+ "type": "InfiniBand/RoCE",
+ "bandwidth_gbps": null
+ },
+ {
+ "name": "mlx5_bond_0",
+ "type": "InfiniBand/RoCE",
+ "bandwidth_gbps": null
+ }
+ ],
+ "os": "Ubuntu Jammy Jellyfish (development branch)",
+ "python_version": "3.10.8",
+ "kernel_version": "5.15.0-105-generic",
+ "runtime_version": "Moore Threads Driver 2.7.0",
+ "pytorch_version": "2.2.0"
+ },
+ "software": {
+ "framework": "vllm-musa",
+ "framework_version": "0.4.2",
+ "driver_version": "2.7.0",
+ "runtime_version": "Moore Threads Driver 2.7.0",
+ "os": "Ubuntu Jammy Jellyfish (development branch)",
+ "python_version": "3.10.8"
+ },
+ "model": {
+ "model_id": "Qwen/Qwen2.5-0.5B-Instruct",
+ "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775",
+ "model_name": null,
+ "model_note": null,
+ "model_source": "local",
+ "architecture": "dense",
+ "parameter_count_b": 0.5,
+ "precision": "BF16",
+ "effective_dtype": "float16",
+ "quantization_method": null,
+ "model_format": "HuggingFace original"
+ },
+ "task": {
+ "scenario": "online",
+ "num_runs": 3,
+ "warmup_runs": 1,
+ "parallelism": {
+ "tensor_parallel_size": 1,
+ "pipeline_parallel_size": 1,
+ "expert_parallel_size": 1,
+ "data_parallel_size": 1
+ },
+ "extra_config": null,
+ "runtime_metrics": null
+ },
+ "metrics": {
+ "online": {
+ "sla_ttft_ms": 500,
+ "max_valid_qps": 40,
+ "results_by_qps": [
+ {
+ "target_qps": 10,
+ "achieved_qps": 10.0,
+ "ttft_ms_p50": 47.68,
+ "ttft_ms_p90": 96.31,
+ "ttft_ms_p99": 956.22,
+ "tpot_ms_p50": 47.25,
+ "tpot_ms_p90": 80.82,
+ "tpot_ms_p99": 131.63,
+ "elapsed_seconds_median": 37.8,
+ "sla_met": false
+ },
+ {
+ "target_qps": 40,
+ "achieved_qps": 40.0,
+ "ttft_ms_p50": 94.5,
+ "ttft_ms_p90": 194.64,
+ "ttft_ms_p99": 331.88,
+ "tpot_ms_p50": 74.76,
+ "tpot_ms_p90": 287.01,
+ "tpot_ms_p99": 444.19,
+ "elapsed_seconds_median": 19.0,
+ "sla_met": true
+ }
+ ]
+ }
+ },
+ "accuracy": {
+ "subset_score": null,
+ "baseline_delta": null,
+ "valid": false,
+ "notes": "Run --scenario accuracy to check model accuracy."
+ },
+ "meta": {
+ "submitted_by": "JuhaoLiang1997",
+ "submission_type": "individual",
+ "date": "2026-05-18",
+ "time": "16:53:54",
+ "run_id": "4f66d29d",
+ "run_name": "mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d",
+ "flagged": null,
+ "reproduce_script": "runners/moorethreads_vllm_musa_f2f6f965/runner.py",
+ "env_info_file": "../env_info.json",
+ "log_file": "run.log",
+ "samples_file": "samples.jsonl",
+ "notes": null,
+ "benchmark_start_time": "2026-05-18T08:51:01.188901+00:00",
+ "benchmark_end_time": "2026-05-18T08:53:54.250762+00:00",
+ "benchmark_elapsed_minutes": 2.9,
+ "model_load_seconds": 132.6
+ }
+}
\ No newline at end of file
diff --git a/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/result.json b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/result.json
new file mode 100644
index 0000000..a1c073d
--- /dev/null
+++ b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/result.json
@@ -0,0 +1,215 @@
+{
+ "schema_version": "1.0",
+ "suite_id": "suite_F",
+ "implementation_id": "moorethreads_vllm_musa_f2f6f965",
+ "chip": {
+ "name": "MTT S4000",
+ "vendor": "Moore Threads",
+ "count": 1,
+ "memory_gb": 48.0,
+ "interconnect_intra_node": null,
+ "interconnect_inter_node": null
+ },
+ "environment": {
+ "collected_at": "2026-05-18T08:40:55.208034+00:00",
+ "accelerators": [
+ {
+ "index": 0,
+ "name": "MTT S4000",
+ "vendor": "Moore Threads",
+ "memory_gb": 48.0,
+ "driver_version": "2.7.0",
+ "firmware_version": null,
+ "supports_bf16": true
+ }
+ ],
+ "accelerator_platform": "moorethreads",
+ "accelerator_topology": null,
+ "intra_node_interconnect": null,
+ "cpu": {
+ "model": "Intel(R) Xeon(R) Gold 6430",
+ "physical_cores": 64,
+ "logical_cores": 128,
+ "numa_nodes": 2
+ },
+ "system_memory_gb": 1007.5,
+ "pcie_generation": "PCIe 16x/16x",
+ "cpu_accelerator_bandwidth_gbs": null,
+ "network_interfaces": [
+ {
+ "name": "mlx5_0",
+ "type": "InfiniBand/RoCE",
+ "bandwidth_gbps": null
+ },
+ {
+ "name": "mlx5_1",
+ "type": "InfiniBand/RoCE",
+ "bandwidth_gbps": null
+ },
+ {
+ "name": "mlx5_bond_0",
+ "type": "InfiniBand/RoCE",
+ "bandwidth_gbps": null
+ }
+ ],
+ "os": "Ubuntu Jammy Jellyfish (development branch)",
+ "python_version": "3.10.8",
+ "kernel_version": "5.15.0-105-generic",
+ "runtime_version": "Moore Threads Driver 2.7.0",
+ "pytorch_version": "2.2.0"
+ },
+ "software": {
+ "framework": "vllm-musa",
+ "framework_version": "0.4.2",
+ "driver_version": "2.7.0",
+ "runtime_version": "Moore Threads Driver 2.7.0",
+ "os": "Ubuntu Jammy Jellyfish (development branch)",
+ "python_version": "3.10.8"
+ },
+ "model": {
+ "model_id": "Qwen/Qwen2.5-0.5B-Instruct",
+ "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775",
+ "model_name": null,
+ "model_note": null,
+ "model_source": "local",
+ "architecture": "dense",
+ "parameter_count_b": 0.5,
+ "precision": "BF16",
+ "effective_dtype": "float16",
+ "quantization_method": null,
+ "model_format": "HuggingFace original"
+ },
+ "task": {
+ "scenarios_run": [
+ "offline",
+ "online",
+ "interactive"
+ ],
+ "parallelism": {
+ "tensor_parallel_size": 1,
+ "pipeline_parallel_size": 1,
+ "expert_parallel_size": 1,
+ "data_parallel_size": 1
+ },
+ "num_runs": 3,
+ "extra_config": null
+ },
+ "metrics": {
+ "derived": {},
+ "offline": {
+ "results_by_concurrency": [
+ {
+ "client_concurrency": 4,
+ "throughput_tokens_per_sec": 1994.51,
+ "throughput_tokens_per_sec_per_chip": 1994.51,
+ "throughput_tokens_per_sec_total": 3642.41,
+ "elapsed_seconds_median": 12.5,
+ "peak_memory_gb": null,
+ "power_watts_avg": null,
+ "power_watts_peak": null,
+ "oom": false,
+ "_throughput_note": "output_only",
+ "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+ },
+ {
+ "client_concurrency": 16,
+ "throughput_tokens_per_sec": 1998.44,
+ "throughput_tokens_per_sec_per_chip": 1998.44,
+ "throughput_tokens_per_sec_total": 3649.59,
+ "elapsed_seconds_median": 12.5,
+ "peak_memory_gb": null,
+ "power_watts_avg": null,
+ "power_watts_peak": null,
+ "oom": false,
+ "_throughput_note": "output_only",
+ "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+ },
+ {
+ "client_concurrency": 64,
+ "throughput_tokens_per_sec": 2004.02,
+ "throughput_tokens_per_sec_per_chip": 2004.02,
+ "throughput_tokens_per_sec_total": 3659.77,
+ "elapsed_seconds_median": 12.5,
+ "peak_memory_gb": null,
+ "power_watts_avg": null,
+ "power_watts_peak": null,
+ "oom": false,
+ "_throughput_note": "output_only",
+ "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+ }
+ ]
+ },
+ "online": {
+ "sla_ttft_ms": 500,
+ "max_valid_qps": 40,
+ "results_by_qps": [
+ {
+ "target_qps": 10,
+ "achieved_qps": 10.0,
+ "ttft_ms_p50": 47.68,
+ "ttft_ms_p90": 96.31,
+ "ttft_ms_p99": 956.22,
+ "tpot_ms_p50": 47.25,
+ "tpot_ms_p90": 80.82,
+ "tpot_ms_p99": 131.63,
+ "elapsed_seconds_median": 37.8,
+ "sla_met": false
+ },
+ {
+ "target_qps": 40,
+ "achieved_qps": 40.0,
+ "ttft_ms_p50": 94.5,
+ "ttft_ms_p90": 194.64,
+ "ttft_ms_p99": 331.88,
+ "tpot_ms_p50": 74.76,
+ "tpot_ms_p90": 287.01,
+ "tpot_ms_p99": 444.19,
+ "elapsed_seconds_median": 19.0,
+ "sla_met": true
+ }
+ ]
+ },
+ "interactive": {
+ "ttft_ms_p50": 25.89,
+ "ttft_ms_p90": 27.18,
+ "ttft_ms_p99": 28.51,
+ "tpot_ms_p50": 14.85,
+ "tpot_ms_p90": 15.17,
+ "tpot_ms_p99": 15.5,
+ "peak_memory_gb": null,
+ "elapsed_seconds_median": 481.4
+ }
+ },
+ "accuracy": {
+ "subset_score": 0.07,
+ "baseline_delta": -0.31,
+ "valid": false,
+ "framework": "vllm-musa",
+ "precision": "BF16",
+ "notes": "Integrated accuracy check \u2014 used same vllm-musa instance as benchmark."
+ },
+ "meta": {
+ "submitted_by": "JuhaoLiang1997",
+ "submission_type": "individual",
+ "date": "2026-05-18",
+ "time": "16:48:27",
+ "run_id": "4f66d29d",
+ "run_name": "mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d",
+ "flagged": null,
+ "reproduce_script": "runners/moorethreads_vllm_musa_f2f6f965/runner.py",
+ "env_info_file": "../env_info.json",
+ "log_file": "run.log",
+ "samples_file": "samples.jsonl",
+ "notes": "Partial run: ['offline', 'online', 'interactive'] succeeded, ['accuracy'] failed.",
+ "benchmark_start_time": "2026-05-18T08:45:57.373367+00:00",
+ "benchmark_end_time": "2026-05-18T08:48:27.423209+00:00",
+ "benchmark_elapsed_minutes": 29.8,
+ "model_load_seconds": 146.8,
+ "benchmark_elapsed_minutes_note": "Total across ['offline', 'online', 'interactive'] scenarios.",
+ "scenario_dirs": {
+ "offline": "results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/offline",
+ "online": "results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/online",
+ "interactive": "results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/interactive"
+ }
+ }
+}
\ No newline at end of file
diff --git a/runners/README.md b/runners/README.md
index 95290aa..aaf4d81 100644
--- a/runners/README.md
+++ b/runners/README.md
@@ -252,7 +252,7 @@ nvidia_trtllm_fp8_8d2f1a4b
amd_vllm_rocm_7b2e1d8f
ascend_mindie_9c4a3f11
apple_mlx_b3e21f09
-moorethreads_vllm_musa_57ff5443
+moorethreads_vllm_musa_f2f6f965
```
---
diff --git a/runners/moorethreads_vllm_musa_f2f6f965/README.md b/runners/moorethreads_vllm_musa_f2f6f965/README.md
new file mode 100644
index 0000000..5111bdc
--- /dev/null
+++ b/runners/moorethreads_vllm_musa_f2f6f965/README.md
@@ -0,0 +1,145 @@
+# moorethreads_vllm_musa_f2f6f965 — Moore Threads MUSA Runner (vllm-musa)
+
+AccelMark runner for Moore Threads MUSA GPUs using
+[vllm-musa](https://github.com/MooreThreads/vllm-musa).
+
+## Supported suites
+
+| Suite | Description | Notes |
+|-------|-------------|-------|
+| Suite A | Single-chip, Llama-3-8B | Validated on S4000 (default: accuracy/offline/online) |
+| Suite B | Multi-chip, Llama-3-70B | MCCL tensor parallelism; set `VLLM_WORKER_MULTIPROC_METHOD=spawn` |
+| Suite C | Quantization, Llama-3.1-8B | FP8 skipped (not supported); W8A8/W8A16 via compressed-tensors |
+| Suite D | Long context ~28K input, Llama-3.1-8B | Reduce `max_num_seqs` / `gpu_memory_utilization` in runner config |
+| Suite E | Multi-chip scaling, Llama-3-8B | MCCL tensor parallelism |
+| Suite F | Edge, Qwen2.5-0.5B | Validated on MTT S4000 (community result in repo; perf scenarios pass, the MMLU accuracy gate does not — see [Accuracy](#accuracy)) |
+| Suite G | MoE multi-chip, Mixtral-8x7B | Unsupported |
+
+## Hardware compatibility
+
+| GPU | BF16 / FP16 | Multi-chip TP | FP8 | Notes |
+|-----|-------------|---------------|-----|-------|
+| MTT S4000 / S5000 | ✅ (BF16 → float16 on vLLM < 0.10) | ✅ (MCCL) | ❌ | Tested with vLLM 0.4.x+musa |
+| MTT S3000 / S80 | ✅ | ✅ | ❌ | May need `--enforce-eager` on Triton errors |
+
+FP8 is excluded — not supported on this runner. FP32 inference fails with
+FlashAttention on MUSA (use FP16 or BF16). Qwen3 requires a newer vLLM + MUSA port
+(Qwen2.5 / Llama-3 work on 0.4.x).
+
+## Prerequisites
+
+Install in this order — **do not** `pip install torch` or `vllm` from PyPI on a
+bare Linux host:
+
+**1. MUSA toolkit + driver**
+
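+Install the Moore Threads GPU driver and the MUSA toolkit from the [Moore Threads developer portal](https://developer.mthreads.com/) first, and confirm the card is visible (for example with `mthreads-gmi`) before building anything else.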
+
+**2. vllm-musa (official build)**
+
+| Resource | URL |
+|----------|-----|
+| Repository | [MooreThreads/vllm-musa](https://github.com/MooreThreads/vllm-musa) |
+| Build guide | [README_vllm_musa.md](https://github.com/MooreThreads/vllm-musa/blob/main/README_vllm_musa.md) |
+| PyTorch MUSA | [MooreThreads/torch_musa](https://github.com/MooreThreads/torch_musa) |
+
+```bash
+git clone https://github.com/MooreThreads/vllm-musa.git
+cd vllm-musa
+bash build_musa.sh
+python -c "from vllm import LLM; print('vllm ok')"
+```
+
+**3. Runner dependencies**
+
+```bash
+pip install -r runners/moorethreads_vllm_musa_f2f6f965/requirements.txt
+```
+
+Pin `transformers` to **4.43–4.46** (not 5.x) when on vLLM 0.4.x; `requirements.txt` enforces this as `transformers>=4.43.0,<4.47.0`.
+
+**Environment variables**
+
+```bash
+export MUSA_VISIBLE_DEVICES=0
+export VLLM_WORKER_MULTIPROC_METHOD=spawn # when tensor_parallel_size > 1
+```
+
+## Smoke test
+
+```bash
+python runners/moorethreads_vllm_musa_f2f6f965/test_smoke.py
+python runners/moorethreads_vllm_musa_f2f6f965/test_smoke.py /path/to/model
+```
+
+## Accuracy
+
+AccelMark runs an integrated MMLU subset after each benchmark using the **same**
+vLLM instance as the perf run. The runner sets `device=musa`, dtype, and
+tokenizer correctly; low scores on vLLM **0.4.x+musa** reflect broken generation
+in that stack, not missing AccelMark wiring.
+
+| Model | Suite | Measured | Baseline |
+|-------|-------|----------|----------|
+| Qwen2.5-0.5B-Instruct | F | **~0.07** | 0.37 (FP16) / 0.38 (BF16) |
+| Llama-3-8B-Instruct | A | **~0.07** | 0.60 (BF16) |
+
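+Throughput runs complete normally, but the generated answers are unusable
+(repetition, system-prompt regurgitation). The score is a similar ~7% across
+different models, which is below the 25% that random guessing on four-choice
+MMLU would give, so the outputs are mostly not parseable as answers at all.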
+
+While accuracy is broken on 0.4.x, use `--skip-accuracy-gate` to finish a perf run:
+
+```bash
+python run.py --runner moorethreads_vllm_musa_f2f6f965 \
+ --suite suite_F --precision FP16 --skip-accuracy-gate
+```
+
+Likely fix: upgrade to a vllm-musa build aligned with vLLM **0.10+**, then
+re-run without `--skip-accuracy-gate`. Keep the `transformers` 4.43–4.46 pin
+only for as long as you stay on a legacy 0.4.x fork.
+
+## Usage
+
+```bash
+python run.py --runner moorethreads_vllm_musa_f2f6f965 --suite suite_F --precision FP16
+
+VLLM_WORKER_MULTIPROC_METHOD=spawn \
+python run.py --runner moorethreads_vllm_musa_f2f6f965 \
+ --suite suite_B --tensor-parallel-size 8
+```
+
+Optional runner config (copy and edit):
+
+```bash
+cp configs/runner_configs/runner_moorethreads_vllm_musa_f2f6f965.yaml.example \
+ configs/runner_configs/runner_moorethreads_vllm_musa_f2f6f965.yaml
+```
+
+| Field | Default | Notes |
+|-------|---------|-------|
+| `tensor_parallel_size` | 1 | MCCL tensor parallelism |
+| `enforce_eager` | false | Only if Triton / graph capture errors |
+| `max_num_seqs` | 256 | Lower on small HBM |
+| `gpu_memory_utilization` | 0.85 | Lower if OOM |
+
+## Troubleshooting
+
+| Symptom | Fix |
+|---------|-----|
+| `GLIBCXX_3.4.30` on import | Import `torch` before `transformers` (runner and smoke test do this) |
+| `KeyError: 'type'` in rope_scaling | Pin `transformers==4.46.3` (not 5.x) |
+| `Expected musa device, got cuda:0` | Use this runner (`device="musa"`) |
+| MMLU ~0.07 | See [Accuracy](#accuracy); `--skip-accuracy-gate` for perf-only runs |
+| OOM | Lower `gpu_memory_utilization` / `max_num_seqs` |
+| Triton / graph errors | `--enforce-eager` or `enforce_eager: true` in runner YAML |
+
+## Requirements
+
+See `requirements.txt` for AccelMark extras. vLLM, torch_musa, and the MUSA
+driver are installed per the official vllm-musa guide above (not from this file).
+
+Minimum environment:
+
+- Moore Threads GPU with MUSA driver
+- Python 3.10+
+- vllm-musa build per [MooreThreads/vllm-musa](https://github.com/MooreThreads/vllm-musa)
diff --git a/runners/moorethreads_vllm_musa_f2f6f965/meta.json b/runners/moorethreads_vllm_musa_f2f6f965/meta.json
new file mode 100644
index 0000000..e57d72d
--- /dev/null
+++ b/runners/moorethreads_vllm_musa_f2f6f965/meta.json
@@ -0,0 +1,21 @@
+{
+ "id": "moorethreads_vllm_musa_f2f6f965",
+ "platform": "moorethreads",
+ "name": "vllm-musa on Moore Threads MUSA GPU",
+ "framework": "vllm-musa",
+ "submitted_by": "JuhaoLiang1997",
+ "description": "AccelMark runner for Moore Threads MUSA GPUs using vllm-musa (https://github.com/MooreThreads/vllm-musa). Install torch/vllm per upstream README_vllm_musa.md; requirements.txt adds benchmark deps only. Sets device=musa; BF16 maps to float16 on vLLM <0.10. MCCL tensor parallelism. FP8 unsupported.",
+ "supersedes_chain": [],
+ "notes": "Smoke-tested on MTT S4000 (vLLM 0.4.2+musa): Suite A and F default scenarios run. MMLU not at baseline — see runner README.",
+ "created": "2026-05-18",
+ "hardware_label": null,
+ "suite_support": {
+ "A": "validated",
+ "B": "pending",
+ "C": "pending",
+ "D": "pending",
+ "E": "pending",
+ "F": "validated",
+ "G": "unsupported"
+ }
+}
diff --git a/runners/moorethreads_vllm_musa_f2f6f965/requirements.txt b/runners/moorethreads_vllm_musa_f2f6f965/requirements.txt
new file mode 100644
index 0000000..1fe16ee
--- /dev/null
+++ b/runners/moorethreads_vllm_musa_f2f6f965/requirements.txt
@@ -0,0 +1,22 @@
+# AccelMark — moorethreads_vllm_musa_f2f6f965
+#
+# AccelMark benchmark dependencies only. Install MUSA toolkit, torch_musa, and
+# vllm-musa first — see README.md and https://github.com/MooreThreads/vllm-musa
+#
+# pip install -r runners/moorethreads_vllm_musa_f2f6f965/requirements.txt
+# python runners/moorethreads_vllm_musa_f2f6f965/test_smoke.py
+
+# AccelMark / loadgen
+numpy==1.26.4
+jsonschema==4.25.1
+psutil==7.1.0
+tqdm==4.67.1
+aiohttp==3.12.15
+PyYAML==6.0.2
+
+# Tokenizer / config (pin to match vLLM 0.4.x — see README)
+transformers>=4.43.0,<4.47.0
+tokenizers>=0.20.0,<0.21.0
+huggingface-hub>=0.26.0,<0.27.0
+accelerate>=1.2.0,<1.3.0
+safetensors>=0.4.5,<0.5.0
diff --git a/runners/moorethreads_vllm_musa_f2f6f965/runner.py b/runners/moorethreads_vllm_musa_f2f6f965/runner.py
new file mode 100644
index 0000000..b693369
--- /dev/null
+++ b/runners/moorethreads_vllm_musa_f2f6f965/runner.py
@@ -0,0 +1,440 @@
+"""
+AccelMark — Moore Threads MUSA vLLM benchmark runner (vllm-musa).
+
+Implements BenchmarkRunner for vllm-musa on Moore Threads MUSA GPUs.
+See README.md in this folder for install and hardware notes.
+"""
+
+import asyncio
+import gc
+import sys
+import time
+from pathlib import Path
+from typing import Optional
+
+_REPO_ROOT = Path(__file__).resolve().parent.parent.parent
+sys.path.insert(0, str(_REPO_ROOT))
+
+from runners.benchmark_runner import BenchmarkRunner, InferenceRequest
+from loadgen.types import InferenceResult
+
+import logging
+logging.getLogger("vllm.engine.async_llm_engine").setLevel(logging.WARNING)
+logging.getLogger("vllm.engine.llm_engine").setLevel(logging.WARNING)
+
+
+class MoorethreadsVLLMMUSARunner(BenchmarkRunner):
+ """vLLM on Moore Threads MUSA via vllm-musa.
+
+ Wraps either a synchronous ``vllm.LLM`` or an ``AsyncLLMEngine`` (chosen in
+ ``load_model``) and always asks vLLM for ``device="musa"`` when the
+ installed engine accepts a ``device`` argument.
+ """
+
+ # Capability flags; presumably read by the BenchmarkRunner base class to
+ # decide which scenarios may run — confirm against the base class.
+ SUPPORTS_STREAMING = True
+ SUPPORTS_BATCHING = True
+ SUPPORTS_ONLINE = True
+ SUPPORTS_MULTI_CHIP = True
+
+ # Precisions / quantization backends this runner advertises. FP8 is
+ # deliberately absent.
+ SUPPORTED_PRECISIONS = ["bf16", "fp16"]
+ SUPPORTED_QUANTIZATION_BACKENDS = ["compressed-tensors"]
+
+ # Class-wide latch flipped by _prepare_musa_runtime() after its first call.
+ _musa_runtime_prepared = False
+
+ def __init__(self):
+ """Start with no engine loaded; ``load_model`` populates these fields."""
+ # Synchronous vllm.LLM, created by load_model when use_async is false.
+ self.llm = None
+ # AsyncLLMEngine, created by load_model when use_async is true.
+ self.engine = None
+ self.tokenizer = None
+ self.sampling_params = None
+ # Event loop created together with the async engine; remains None on
+ # the synchronous path.
+ self._loop: asyncio.AbstractEventLoop = None
+
+    def _get_chip_count(self) -> int:
+        """Return the number of Moore Threads GPUs visible on this host.
+
+        Detection order:
+
+        1. ``pymtml`` (Moore Threads management library bindings), if it is
+           importable and reports a positive count.
+        2. The PyTorch device namespaces, ``torch.musa`` first and then
+           ``torch.cuda``; the first one reporting a positive count wins.
+
+        Returns:
+            The detected device count, or ``1`` when nothing could be
+            detected. This method never raises.
+        """
+        try:
+            import pymtml
+            pymtml.mtmlInit()
+            try:
+                n = pymtml.mtmlDeviceGetCount()
+            finally:
+                # Always pair mtmlInit with a shutdown, even if counting failed.
+                try:
+                    pymtml.mtmlShutdown()
+                except Exception:
+                    pass
+            if n and n > 0:
+                return int(n)
+        except Exception:
+            pass
+
+        try:
+            import torch
+        except Exception:
+            return 1
+
+        # Prefer the MUSA namespace: on a MUSA-only host torch.cuda reports
+        # zero devices, which used to collapse the answer to 1.
+        for namespace in ("musa", "cuda"):
+            backend = getattr(torch, namespace, None)
+            if backend is None:
+                continue
+            try:
+                n = int(backend.device_count())
+            except Exception:
+                continue
+            if n > 0:
+                return n
+        return 1
+
+ def _get_framework_name(self) -> str:
+ """Return the fixed framework label for this runner, ``"vllm-musa"``."""
+ return "vllm-musa"
+
+ def _get_framework_version(self) -> str:
+ """Build a version string from the vllm-musa plugin and core vLLM.
+
+ Returns:
+ ``"<plugin>+vllm-<core>"`` when the plugin version is known, the bare
+ core ``vllm.__version__`` when only that is known, and ``"unknown"``
+ when neither can be determined.
+ """
+ plugin_version = "unknown"
+ try:
+ # First choice: installed distribution metadata for "vllm-musa".
+ from importlib.metadata import version
+ plugin_version = version("vllm-musa")
+ except Exception:
+ try:
+ # Second choice: a __version__ attribute on the platform module.
+ import vllm_musa_platform # type: ignore
+ plugin_version = getattr(vllm_musa_platform, "__version__", "unknown")
+ except Exception:
+ pass
+ try:
+ import vllm
+ core_version = vllm.__version__
+ except Exception:
+ core_version = "unknown"
+ if plugin_version == "unknown" and core_version == "unknown":
+ return "unknown"
+ if plugin_version == "unknown":
+ return core_version
+ # NOTE: core_version may itself be "unknown" here; it is still embedded.
+ return f"{plugin_version}+vllm-{core_version}"
+
+ def get_model_format(self) -> str:
+ """Return the fixed model-format label, ``"HuggingFace original"``."""
+ return "HuggingFace original"
+
+ @classmethod
+ def _prepare_musa_runtime(cls) -> None:
+ """Import ``torch`` once per process before anything else is loaded.
+
+ ``load_model`` calls this before importing ``transformers`` and ``vllm``.
+ Presumably this works around a libstdc++ load-order problem
+ (``GLIBCXX_3.4.30``) — confirm against the runner README.
+ """
+ if cls._musa_runtime_prepared:
+ return
+ # Imported purely for its side effect of loading the torch libraries.
+ import torch # noqa: F401
+ cls._musa_runtime_prepared = True
+
+ @staticmethod
+ def _legacy_vllm_musa() -> bool:
+ """Report whether the installed vLLM is older than 0.10.
+
+ Returns:
+ ``True`` when ``vllm.__version__`` parses to a (major, minor) below
+ ``(0, 10)``, and also when vLLM cannot be imported or its version
+ cannot be parsed (the conservative default).
+ """
+ try:
+ import vllm
+ # Drop any local suffix such as "+musa" before parsing.
+ ver = vllm.__version__.split("+")[0]
+ major, minor = (int(x) for x in ver.split(".")[:2])
+ return (major, minor) < (0, 10)
+ except Exception:
+ return True
+
+ @staticmethod
+ def _get_engine_arg_fields() -> set[str]:
+ """Return the field names of the installed ``vllm`` ``EngineArgs``.
+
+ Returns:
+ The set of dataclass field names, or an empty set when ``EngineArgs``
+ cannot be imported or introspected. Callers treat an empty set as
+ "unknown" rather than "no fields".
+ """
+ try:
+ import dataclasses
+ from vllm.engine.arg_utils import EngineArgs
+ return {f.name for f in dataclasses.fields(EngineArgs)}
+ except Exception:
+ return set()
+
+ def _resolve_musa_dtype(self, dtype: str, precision: str) -> str:
+ """Map the requested dtype to one the installed vLLM build should use.
+
+ Args:
+ dtype: vLLM dtype string chosen so far (e.g. ``"bfloat16"``,
+ ``"float16"``, ``"auto"``).
+ precision: Benchmark precision label; compared case-insensitively
+ against ``"BF16"``.
+
+ Returns:
+ ``dtype`` unchanged on vLLM 0.10+. On older builds, ``"float16"``
+ whenever ``dtype`` is ``"bfloat16"``/``"auto"`` or the precision is
+ BF16; otherwise ``dtype`` unchanged.
+ """
+ if not self._legacy_vllm_musa():
+ return dtype
+ # NOTE(review): a BF16 precision label forces float16 even if an explicit
+ # non-bfloat16 dtype override was supplied — confirm this is intended.
+ if dtype in ("bfloat16", "auto") or precision.upper() == "BF16":
+ if dtype != "float16":
+ print(" Note: vLLM 0.4.x+musa — using float16")
+ return "float16"
+ return dtype
+
+ def load_model(self, model_path: str, parallelism: dict) -> None:
+ """Create the tokenizer, sampling params and a vLLM engine on MUSA.
+
+ Args:
+ model_path: Model path or id handed to the tokenizer and to vLLM.
+ parallelism: Settings dict. Reads ``tensor_parallel_size``,
+ ``pipeline_parallel_size``, ``max_tokens``, ``max_model_len`` and
+ ``use_async`` (all required) plus optional
+ ``expert_parallel_size``.
+
+ Raises:
+ AssertionError: If ``pipeline_parallel_size`` is greater than 1.
+
+ Side effects:
+ Sets ``self.tokenizer``, ``self.sampling_params`` and
+ ``self._quantization_method``; then either ``self.llm`` (sync) or
+ ``self._loop`` and ``self.engine`` (async).
+ """
+ self._prepare_musa_runtime()
+
+ # Imported lazily, after torch has been loaded by the call above.
+ from transformers import AutoTokenizer
+ from vllm import LLM, AsyncLLMEngine, SamplingParams
+ from vllm.engine.arg_utils import AsyncEngineArgs
+
+ tp_size = parallelism["tensor_parallel_size"]
+ pp_size = parallelism["pipeline_parallel_size"]
+ ep_size = parallelism.get("expert_parallel_size", 1)
+ assert pp_size <= 1, (
+ "Pipeline parallelism is not supported. Use --tensor-parallel-size."
+ )
+
+ max_tokens = parallelism["max_tokens"]
+ max_model_len = parallelism["max_model_len"]
+ use_async = parallelism["use_async"]
+ enforce_eager = getattr(self, "_enforce_eager", False)
+
+ # Runner-config values with their defaults.
+ cfg = getattr(self, "_runner_config", {})
+ max_num_seqs = cfg.get("max_num_seqs", 256)
+ musa_memory_util = cfg.get("gpu_memory_utilization", 0.85)
+ extra_kwargs = dict(cfg.get("engine_kwargs") or {})
+
+ # Drop engine_kwargs the installed EngineArgs does not define. When the
+ # field set is unknown (empty), nothing is filtered.
+ _valid_engine_fields = self._get_engine_arg_fields()
+ if _valid_engine_fields:
+ _dropped = {k: v for k, v in extra_kwargs.items()
+ if k not in _valid_engine_fields}
+ if _dropped:
+ print(f" Warning: engine_kwargs keys not supported by this "
+ f"vllm-musa / vLLM version and will be ignored: "
+ f"{list(_dropped)}")
+ extra_kwargs = {k: v for k, v in extra_kwargs.items()
+ if k in _valid_engine_fields}
+
+ effective_precision = getattr(self, "_effective_precision", "BF16").upper()
+ precision = getattr(self, "_precision", None) or effective_precision
+ _dtype_override = getattr(self, "_precision_dtype_override", None)
+ _prec_eng_kwargs = dict(getattr(self, "_precision_engine_kwargs", None) or {})
+ # "quantization" is pulled out so it is passed as its own engine argument.
+ quantization = _prec_eng_kwargs.pop("quantization", None)
+
+ # NOTE(review): the map keys are upper-case but self._precision is used
+ # as given; a lower-case value would fall through to "auto" — confirm the
+ # caller always supplies upper-case labels.
+ _NATIVE_DTYPE_MAP = {"BF16": "bfloat16", "FP16": "float16", "FP32": "float32"}
+ dtype = _NATIVE_DTYPE_MAP.get(precision, "auto")
+ self._quantization_method = quantization
+
+ if _dtype_override:
+ dtype = _dtype_override
+ dtype = self._resolve_musa_dtype(dtype, precision)
+ # Merge order: precision-specific kwargs first, runner-config
+ # engine_kwargs on top (the latter win on key conflicts).
+ if _prec_eng_kwargs:
+ _prec_eng_kwargs.update(extra_kwargs)
+ extra_kwargs = _prec_eng_kwargs
+
+ # Translate the flat speculative_* keys into a speculative_config dict
+ # unless the caller already supplied one.
+ if "speculative_model" in extra_kwargs and "speculative_config" not in extra_kwargs:
+ extra_kwargs["speculative_config"] = {
+ "model": extra_kwargs.pop("speculative_model"),
+ "num_speculative_tokens": extra_kwargs.pop("num_speculative_tokens", 4),
+ "draft_tensor_parallel_size": extra_kwargs.pop(
+ "speculative_draft_tensor_parallel_size", 1
+ ),
+ }
+
+ print(
+ f"Loading model: precision={precision}, dtype={dtype}"
+ + (f", quantization_method={self._quantization_method}"
+ if self._quantization_method else "")
+ )
+
+ self.tokenizer = AutoTokenizer.from_pretrained(
+ model_path, trust_remote_code=False
+ )
+ # temperature=0.0 is passed for every request issued by this runner.
+ self.sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0)
+
+ base_kwargs = dict(
+ model=model_path,
+ dtype=dtype,
+ tensor_parallel_size=tp_size,
+ trust_remote_code=False,
+ enforce_eager=enforce_eager,
+ )
+ # Request the MUSA device when EngineArgs has a "device" field, or when
+ # the field set could not be determined.
+ if not _valid_engine_fields or "device" in _valid_engine_fields:
+ base_kwargs["device"] = "musa"
+ if ep_size > 1:
+ base_kwargs["enable_expert_parallel"] = True
+ if quantization:
+ base_kwargs["quantization"] = quantization
+ if max_model_len:
+ base_kwargs["max_model_len"] = max_model_len
+
+ # In both branches extra_kwargs is spread last, so it overrides the
+ # defaults assembled above.
+ if not use_async:
+ self.llm = LLM(**{
+ **base_kwargs,
+ "max_num_seqs": max_num_seqs,
+ "gpu_memory_utilization": musa_memory_util,
+ **extra_kwargs,
+ })
+ else:
+ # NOTE(review): max_num_seqs from the runner config is not passed on
+ # the async path — confirm whether that is intentional.
+ self._loop = asyncio.new_event_loop()
+ asyncio.set_event_loop(self._loop)
+ engine_args = AsyncEngineArgs(**{
+ **base_kwargs,
+ "gpu_memory_utilization": musa_memory_util,
+ **extra_kwargs,
+ })
+ self.engine = AsyncLLMEngine.from_engine_args(engine_args)
+
+ def get_effective_dtype(self) -> Optional[str]:
+ """Return the dtype the loaded engine reports, without the ``torch.`` prefix.
+
+ Reads ``model_config.dtype`` from the sync engine if present, otherwise
+ from the async engine. If neither is loaded, or the lookup raises, falls
+ back to ``self._effective_dtype`` (``None`` when that is unset).
+ """
+ try:
+ if self.llm is not None:
+ return str(self.llm.llm_engine.model_config.dtype).replace("torch.", "")
+ if self.engine is not None:
+ return str(self.engine.engine.model_config.dtype).replace("torch.", "")
+ except Exception:
+ pass
+ return getattr(self, "_effective_dtype", None)
+
+ def inference_fn_offline(self, requests: list[InferenceRequest]) -> list[InferenceResult]:
+ """Run all requests as one batched ``LLM.generate`` call.
+
+ Args:
+ requests: Requests whose ``prompt`` is formatted via
+ ``self._format_prompt`` before generation.
+
+ Returns:
+ One ``InferenceResult`` per generated output. ``total_time_ms`` is
+ the wall time of the whole batch, so it is the same for every
+ result; ``first_token_time_ms`` is always ``None`` on this path.
+
+ Side effects:
+ Stores the generated texts in ``self._last_accuracy_outputs``.
+ """
+ formatted = [self._format_prompt(r.prompt) for r in requests]
+ t_start = time.perf_counter()
+ outputs = self.llm.generate(formatted, self.sampling_params)
+ elapsed = time.perf_counter() - t_start
+
+ self._last_accuracy_outputs = [o.outputs[0].text for o in outputs]
+
+ return [
+ InferenceResult(
+ first_token_time_ms=None,
+ total_time_ms=elapsed * 1000,
+ output_tokens=len(o.outputs[0].token_ids),
+ input_tokens=len(o.prompt_token_ids),
+ success=True,
+ output_text=o.outputs[0].text,
+ )
+ for o in outputs
+ ]
+
+ async def inference_fn_streaming(self, request: InferenceRequest) -> InferenceResult:
+ """Stream one request through the async engine and time it.
+
+ Args:
+ request: Request whose ``prompt`` is formatted via
+ ``self._format_prompt``.
+
+ Returns:
+ An ``InferenceResult`` with time-to-first-token (ms, measured at the
+ first streamed update that carries at least one token), total time
+ (ms), and the final output token count and text.
+ """
+ from vllm.utils import random_uuid
+
+ formatted = self._format_prompt(request.prompt)
+ request_id = random_uuid()
+ t_start = time.perf_counter()
+ first_token_time_ms = None
+ output_tokens = 0
+ output_text = ""
+
+ async for output in self.engine.generate(
+ formatted, self.sampling_params, request_id
+ ):
+ if first_token_time_ms is None and len(output.outputs[0].token_ids) > 0:
+ first_token_time_ms = (time.perf_counter() - t_start) * 1000
+ # Each update replaces the previous values; the last one wins.
+ output_tokens = len(output.outputs[0].token_ids)
+ output_text = output.outputs[0].text
+
+ # NOTE(review): input_tokens is hard-coded to 0 here, whereas the offline
+ # path reports len(prompt_token_ids) — confirm the harness does not rely
+ # on this field for streaming results.
+ return InferenceResult(
+ first_token_time_ms=first_token_time_ms,
+ total_time_ms=(time.perf_counter() - t_start) * 1000,
+ output_tokens=output_tokens,
+ input_tokens=0,
+ success=True,
+ output_text=output_text,
+ )
+
+ async def inference_fn_token_stream(self, request: InferenceRequest):
+ """Yield newly generated text for one request as it streams.
+
+ Each engine update's text is sliced from the length of the previous
+ update's text, so only the newly appended suffix is yielded. Empty
+ deltas are skipped.
+ """
+ from vllm.utils import random_uuid
+
+ formatted = self._format_prompt(request.prompt)
+ request_id = random_uuid()
+ prev_length = 0
+
+ async for output in self.engine.generate(
+ formatted, self.sampling_params, request_id
+ ):
+ current_text = output.outputs[0].text
+ # Slice off what was already yielded on earlier iterations.
+ delta = current_text[prev_length:]
+ if delta:
+ yield delta
+ prev_length = len(current_text)
+
+    def get_peak_memory_gb(self) -> Optional[float]:
+        """Return accelerator memory usage in GiB, or ``None`` if unknown.
+
+        Sources, in order:
+
+        1. ``max_memory_allocated()`` from ``torch.musa`` and then
+           ``torch.cuda``. A namespace that is missing, raises, or reports
+           zero bytes is skipped.
+        2. ``pymtml``: the ``used`` field of the memory info for device
+           index 0. This is a current reading for a single device, not a
+           tracked peak.
+
+        Returns:
+            Memory in GiB as a float, or ``None`` when no source produced a
+            positive or readable value. This method never raises.
+        """
+        try:
+            import torch
+        except Exception:
+            torch = None
+
+        if torch is not None:
+            # Prefer the MUSA namespace; on a MUSA-only host torch.cuda can
+            # report zero, which must not short-circuit the pymtml fallback.
+            for namespace in ("musa", "cuda"):
+                backend = getattr(torch, namespace, None)
+                if backend is None:
+                    continue
+                try:
+                    peak_bytes = backend.max_memory_allocated()
+                except Exception:
+                    continue
+                if peak_bytes and peak_bytes > 0:
+                    return peak_bytes / (1024 ** 3)
+
+        try:
+            import pymtml
+            pymtml.mtmlInit()
+            try:
+                dev = pymtml.mtmlDeviceGetByIndex(0)
+                info = pymtml.mtmlDeviceGetMemoryInfo(dev)
+                used = getattr(info, "used", None)
+                if used is not None:
+                    return float(used) / (1024 ** 3)
+            finally:
+                # Always pair mtmlInit with a shutdown.
+                try:
+                    pymtml.mtmlShutdown()
+                except Exception:
+                    pass
+        except Exception:
+            pass
+        return None
+
+    def release_resources(self) -> None:
+        """Tear down the loaded engine and release accelerator memory.
+
+        Every step is best-effort: a failure in one step is swallowed so the
+        remaining steps still run. Order:
+
+        1. Drop the synchronous engine reference.
+        2. Shut down the async engine on its event loop (if the loop is still
+           open) and drop the reference.
+        3. Ask vLLM to clean up its distributed state, falling back to the
+           older ``destroy_*`` helpers when the newer one is unavailable.
+        4. Destroy any remaining ``torch.distributed`` process group.
+        5. Run the garbage collector, then empty the device cache and reset
+           peak-memory statistics on ``torch.musa`` when that namespace
+           exists, otherwise on ``torch.cuda``.
+        """
+        # Rebinding to None drops the only reference this runner holds.
+        self.llm = None
+
+        if self.engine is not None:
+            try:
+                if self._loop and not self._loop.is_closed():
+                    self._loop.run_until_complete(self.engine.shutdown())
+            except Exception:
+                pass
+            self.engine = None
+
+        try:
+            from vllm.distributed.parallel_state import cleanup_dist_env_and_memory
+            cleanup_dist_env_and_memory(shutdown_ray=False)
+        except Exception:
+            # Older vLLM builds only expose the two separate helpers.
+            try:
+                from vllm.distributed.parallel_state import (
+                    destroy_model_parallel,
+                    destroy_distributed_environment,
+                )
+                destroy_model_parallel()
+                destroy_distributed_environment()
+            except Exception:
+                pass
+
+        try:
+            import torch
+        except Exception:
+            torch = None
+
+        if torch is not None:
+            try:
+                if torch.distributed.is_initialized():
+                    torch.distributed.destroy_process_group()
+            except Exception:
+                pass
+
+        # Collect before emptying the cache so freed tensors are returned too.
+        gc.collect()
+
+        if torch is not None:
+            backend = getattr(torch, "musa", None) or getattr(torch, "cuda", None)
+            # Guard each call separately: a backend lacking one of them must
+            # not prevent the other from running.
+            for hook in ("empty_cache", "reset_peak_memory_stats"):
+                try:
+                    getattr(backend, hook)()
+                except Exception:
+                    pass
+
+ def parse_args(self):
+ """Add vllm-musa-specific CLI flags. Base class pre-loads runner config.
+
+ Parses ``--tensor-parallel-size``, ``--expert-parallel-size`` and
+ ``--enforce-eager`` on top of the base-class arguments, ignoring flags
+ it does not know.
+
+ Returns:
+ The argument namespace produced by the base class.
+
+ Side effects:
+ Sets ``self._enforce_eager``, ``self._parallelism``,
+ ``self._chip_count`` and ``self._precision``.
+ """
+ args = super().parse_args()
+ cfg = self._runner_config
+
+ import argparse
+ # add_help=False so this secondary parser does not claim -h/--help.
+ parser = argparse.ArgumentParser(add_help=False)
+ parser.add_argument("--tensor-parallel-size", type=int, default=None,
+ dest="tensor_parallel_size")
+ parser.add_argument("--expert-parallel-size", type=int, default=None,
+ dest="expert_parallel_size")
+ parser.add_argument("--enforce-eager", action="store_true", default=False,
+ dest="enforce_eager")
+ # parse_known_args tolerates the flags owned by the base-class parser.
+ extra, _ = parser.parse_known_args()
+
+ tp_size, _tp_source = self._resolve_tensor_parallel_size(
+ extra.tensor_parallel_size
+ )
+ # CLI value wins; otherwise fall back to the runner config, then 1.
+ ep_size = (extra.expert_parallel_size
+ if extra.expert_parallel_size is not None
+ else cfg.get("expert_parallel_size", 1))
+ # Eager mode is on if either the CLI flag or the runner config asks for it.
+ self._enforce_eager = extra.enforce_eager or cfg.get("enforce_eager", False)
+
+ print(f" tensor_parallel_size = {tp_size} [{_tp_source}]")
+ if ep_size > 1:
+ print(f" expert_parallel_size = {ep_size} [cli/yaml]")
+
+ if not self.SUPPORTS_MULTI_CHIP and tp_size > 1:
+ print(f"Warning: {self.__class__.__name__} does not support multi-chip. "
+ f"Ignoring tensor_parallel_size={tp_size}, using 1.")
+ tp_size = 1
+ ep_size = 1
+
+ # Pipeline and data parallelism are fixed at 1 for this runner.
+ self._parallelism = {
+ "tensor_parallel_size": tp_size,
+ "pipeline_parallel_size": 1,
+ "expert_parallel_size": ep_size,
+ "data_parallel_size": 1,
+ }
+ # The chip count is taken to be the tensor-parallel size.
+ self._chip_count = tp_size
+ self._precision = getattr(args, "precision", None)
+ return args
+
+ def get_extra_subprocess_args(self, args) -> list[str]:
+ """Rebuild this runner's CLI flags as an argv fragment.
+
+ Args:
+ args: Parsed argument namespace; not read by this implementation.
+
+ Returns:
+ ``--tensor-parallel-size <n>`` always, plus
+ ``--expert-parallel-size <n>`` when it is greater than 1 and
+ ``--enforce-eager`` when eager mode is on.
+ """
+ extra = [
+ "--tensor-parallel-size",
+ str(self._parallelism.get("tensor_parallel_size", 1)),
+ ]
+ if self._parallelism.get("expert_parallel_size", 1) > 1:
+ extra += ["--expert-parallel-size",
+ str(self._parallelism["expert_parallel_size"])]
+ if self._enforce_eager:
+ extra += ["--enforce-eager"]
+ return extra
+
+
+if __name__ == "__main__":
+ MoorethreadsVLLMMUSARunner().main()
diff --git a/runners/moorethreads_vllm_musa_f2f6f965/test_smoke.py b/runners/moorethreads_vllm_musa_f2f6f965/test_smoke.py
new file mode 100644
index 0000000..86cbbf9
--- /dev/null
+++ b/runners/moorethreads_vllm_musa_f2f6f965/test_smoke.py
@@ -0,0 +1,77 @@
+#!/usr/bin/env python3
+"""
+Standalone vllm-musa smoke test (does not use the AccelMark runner).
+
+Usage (from repo root):
+
+ python runners/moorethreads_vllm_musa_f2f6f965/test_smoke.py
+ python runners/moorethreads_vllm_musa_f2f6f965/test_smoke.py /path/to/model
+
+ MODEL_PATH=/path/to/Qwen2.5-0.5B-Instruct \\
+ python runners/moorethreads_vllm_musa_f2f6f965/test_smoke.py
+"""
+
+from __future__ import annotations
+
+import gc
+import os
+import sys
+import time
+
+import torch # noqa: F401 — before transformers/vllm (libstdc++ load order)
+
+from vllm import LLM, SamplingParams
+
+_DEFAULT_MODEL = os.getenv("MODEL_PATH", "Qwen/Qwen2.5-0.5B-Instruct")
+
+PROMPTS = [
+ "The capital of France is",
+ "Say hello in one short sentence.",
+]
+
+
+def main() -> int:
+    """Load a model with vllm-musa, run the smoke prompts, and print the results.
+
+    The model path is the first command-line argument when one is given,
+    otherwise ``_DEFAULT_MODEL``.  Load time, inference time, and each
+    prompt's completion text and token count are printed to stdout.
+
+    Returns:
+        ``0`` once generation and cleanup have finished.  Exceptions raised
+        while loading the model or generating propagate to the caller.
+    """
+    model_path = sys.argv[1] if len(sys.argv) > 1 else _DEFAULT_MODEL
+
+    sampling_params = SamplingParams(temperature=0.0, max_tokens=64)
+
+    print(f"Loading {model_path} ...")
+    t_load = time.perf_counter()
+    llm = LLM(
+        model=model_path,
+        device="musa",
+        dtype="float16",
+        tensor_parallel_size=1,
+        max_model_len=1024,
+        max_num_seqs=4,
+        gpu_memory_utilization=0.85,
+        trust_remote_code=False,
+    )
+    print(f"Model loaded in {time.perf_counter() - t_load:.1f}s\n")
+
+    t_infer = time.perf_counter()
+    outputs = llm.generate(PROMPTS, sampling_params)
+    print(f"Inference done in {time.perf_counter() - t_infer:.1f}s\n")
+
+    for prompt, output in zip(PROMPTS, outputs):
+        completion = output.outputs[0]
+        print(f"Prompt: {prompt!r}")
+        print(f"Output: {completion.text!r}")
+        print(f"Tokens: {len(completion.token_ids)}\n")
+
+    # Drop the engine and collect garbage before asking torch to release its cache.
+    del llm
+    gc.collect()
+    try:
+        if hasattr(torch, "musa"):
+            torch.musa.empty_cache()
+        else:
+            torch.cuda.empty_cache()
+    except Exception as exc:
+        # Cleanup stays best-effort (the smoke test already succeeded), but the
+        # failure is reported instead of being silently discarded.
+        print(
+            f"Warning: could not release the accelerator cache: {exc}",
+            file=sys.stderr,
+        )
+    print("Done.")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/runners/platforms/moorethreads.py b/runners/platforms/moorethreads.py
index 708db1b..9f55684 100644
--- a/runners/platforms/moorethreads.py
+++ b/runners/platforms/moorethreads.py
@@ -1,17 +1,13 @@
"""Moore Threads MUSA GPU platform plug-in.
-Moore Threads ships its own driver and management tooling:
-
-* ``mthreads-gmi`` — the moral equivalent of ``nvidia-smi`` / ``rocm-smi``.
-* ``pymtml`` — Python bindings analogous to NVML / pynvml.
-* ``torchada`` — a CUDA→MUSA compatibility shim that exposes the standard
- ``torch.cuda`` API, with the real backend version available via
- ``torch.version.musa``.
-
-This plug-in first tries the Python bindings (best machine-readable
-output) and falls back to scraping ``mthreads-gmi`` text output. Both
-paths are best-effort: when none of the tools are installed the plug-in
-silently reports zero accelerators and the collector moves on.
+Used by ``runners/collect_env.py`` to populate ``env_info.json``.
+
+Detection order (first non-empty wins):
+
+ 1. ``pymtml`` (mthreads-ml-py) — same API as used in the vllm-musa runner
+ 2. ``mthreads-gmi`` text output
+ 3. ``torch`` device properties (``torch.cuda`` aliased to MUSA via torchada,
+ or native ``torch.musa`` when available)
"""
from __future__ import annotations
@@ -23,8 +19,6 @@
VENDOR_LABEL = "Moore Threads"
PRIORITY = 60
-# S5000 / S4000 datacenter SKUs ship with native BF16 support; the older
-# consumer-class MTT S80/S70 cards are FP16-only.
_BF16_SUPPORTED_HINTS = ("s5000", "s4000", "s3000")
_NO_BF16_HINTS = ("s80", "s70", "s60", "s50")
@@ -40,50 +34,68 @@ def _supports_bf16(chip_name: str) -> bool:
return True
+def _driver_version_from_smi() -> str | None:
+    """Return the driver version printed by ``mthreads-gmi``, or ``None``.
+
+    Runs ``mthreads-gmi`` with no arguments and searches its output for a
+    ``Driver Version: <value>`` field (case-insensitive).  Any failure (the
+    tool is missing, exits non-zero, or prints no such field) yields ``None``.
+    """
+    try:
+        report = subprocess.check_output(
+            ["mthreads-gmi"], text=True, stderr=subprocess.DEVNULL
+        )
+        found = re.search(r"Driver\s+Version\s*:\s*(\S+)", report, re.IGNORECASE)
+    except Exception:
+        return None
+    return found.group(1) if found else None
+
+
def _collect_via_pymtml() -> list[dict]:
try:
- import pymtml as mtml # type: ignore[import-not-found]
+ import pymtml
except ImportError:
return []
try:
- mtml.mtmlInit()
+ pymtml.mtmlInit()
except Exception:
return []
+ driver = _driver_version_from_smi() or "unknown"
accelerators: list[dict] = []
try:
- count = mtml.mtmlDeviceGetCount()
+ count = pymtml.mtmlDeviceGetCount()
except Exception:
try:
- mtml.mtmlShutdown()
+ pymtml.mtmlShutdown()
except Exception:
pass
return []
for idx in range(int(count)):
try:
- handle = mtml.mtmlDeviceGetHandleByIndex(idx)
- name = mtml.mtmlDeviceGetName(handle)
- mem = mtml.mtmlDeviceGetMemoryInfo(handle)
- total_mb = getattr(mem, "total", None) or mem.get("total", 0)
- driver = mtml.mtmlSystemGetDriverVersion()
+ dev = pymtml.mtmlDeviceGetByIndex(idx)
+ name = pymtml.mtmlDeviceGetName(dev)
+ mem = pymtml.mtmlDeviceGetMemoryInfo(dev)
+ total_bytes = getattr(mem, "total", None)
+ if total_bytes is None and isinstance(mem, dict):
+ total_bytes = mem.get("total")
except Exception:
continue
+ if not isinstance(name, str):
+ name = name.decode("utf-8", "ignore")
+ memory_gb = round(int(total_bytes) / (1024 ** 3), 1) if total_bytes else None
accelerators.append(
{
"index": idx,
- "name": name if isinstance(name, str) else name.decode("utf-8", "ignore"),
+ "name": name,
"vendor": VENDOR_LABEL,
- "memory_gb": round(int(total_mb) / 1024, 1) if total_mb else None,
- "driver_version": driver if isinstance(driver, str) else driver.decode("utf-8", "ignore"),
+ "memory_gb": memory_gb,
+ "driver_version": driver,
"firmware_version": None,
- "supports_bf16": _supports_bf16(str(name)),
+ "supports_bf16": _supports_bf16(name),
}
)
try:
- mtml.mtmlShutdown()
+ pymtml.mtmlShutdown()
except Exception:
pass
@@ -91,12 +103,7 @@ def _collect_via_pymtml() -> list[dict]:
def _collect_via_smi() -> list[dict]:
- """Fallback parser for ``mthreads-gmi`` text output.
-
- The output format mirrors nvidia-smi: a header with the driver / MUSA
- versions followed by per-device blocks listing the product name and
- memory usage. We only need the device name and total memory.
- """
+ """Parse ``mthreads-gmi`` text output (mthreads-gmi 1.14+ tabular format)."""
try:
out = subprocess.check_output(
["mthreads-gmi"], text=True, stderr=subprocess.DEVNULL
@@ -110,21 +117,18 @@ def _collect_via_smi() -> list[dict]:
driver = m.group(1)
accelerators: list[dict] = []
- # Per-device rows look like:
- # | 0 MTT S4000 ... | 0000:65:00.0 Off | ... |
- # followed by:
- # | 0% 45C P0 ... / ... | 234MiB / 49152MiB | ... |
+ # Example row:
+ # 0 MTT S4000 |00000000:28:00.0 |0% 4MiB(49152MiB)
for match in re.finditer(
- r"\|\s*(\d+)\s+(MTT\s+\S+(?:\s+\S+)?)\s*", out
+ r"^(\d+)\s+(MTT\s+\S+)\s+\|",
+ out,
+ re.MULTILINE,
):
idx = int(match.group(1))
name = match.group(2).strip()
- # Search downstream of this match for the memory line
- tail = out[match.end():]
- mem_match = re.search(r"(\d+)MiB\s*/\s*(\d+)MiB", tail)
- memory_gb = None
- if mem_match:
- memory_gb = round(int(mem_match.group(2)) / 1024, 1)
+ tail = out[match.end(): match.end() + 256]
+ mem_match = re.search(r"\d+MiB\((\d+)MiB\)", tail)
+ memory_gb = round(int(mem_match.group(1)) / 1024, 1) if mem_match else None
accelerators.append(
{
"index": idx,
@@ -139,23 +143,69 @@ def _collect_via_smi() -> list[dict]:
return accelerators
+def _collect_via_torch() -> list[dict]:
+    """Describe MUSA devices through PyTorch when the management tools found none.
+
+    Uses ``torch.musa`` when that attribute exists, otherwise ``torch.cuda``.
+    On ordinary CUDA/ROCm builds ``torch.cuda`` enumerates GPUs from other
+    vendors, so a device seen through it is only reported when
+    ``torch.version.musa`` is set or its name looks like a Moore Threads part
+    (``MTT ...``).  This keeps foreign GPUs from being labelled with
+    ``VENDOR_LABEL``.
+
+    Returns:
+        One dict per device with the keys ``index``, ``name``, ``vendor``,
+        ``memory_gb``, ``driver_version``, ``firmware_version`` and
+        ``supports_bf16``; an empty list when torch is not importable or
+        exposes no usable device.
+    """
+    try:
+        import torch
+    except ImportError:
+        return []
+
+    native_musa = hasattr(torch, "musa")
+    try:
+        backend = torch.musa if native_musa else torch.cuda
+        count = int(backend.device_count())
+    except Exception:
+        return []
+    if count <= 0:
+        return []
+
+    # torch.cuda is only trusted wholesale when the build reports a MUSA version.
+    trusted = native_musa or bool(
+        getattr(getattr(torch, "version", None), "musa", None)
+    )
+    # Queried only once devices are known to exist, to avoid a needless subprocess.
+    driver = _driver_version_from_smi() or "unknown"
+
+    accelerators: list[dict] = []
+    for idx in range(count):
+        try:
+            props = backend.get_device_properties(idx)
+            raw_name = getattr(props, "name", None)
+            total = getattr(props, "total_memory", None)
+            memory_gb = round(total / (1024 ** 3), 1) if total else None
+        except Exception:
+            continue
+        label = str(raw_name) if raw_name else ""
+        if not trusted:
+            lowered = label.lower()
+            if not (lowered.startswith("mtt") or "moore threads" in lowered):
+                # A non-MUSA GPU visible through plain torch.cuda; not ours.
+                continue
+        name = label or f"MTT GPU {idx}"
+        accelerators.append(
+            {
+                "index": idx,
+                "name": name,
+                "vendor": VENDOR_LABEL,
+                "memory_gb": memory_gb,
+                "driver_version": driver,
+                "firmware_version": None,
+                "supports_bf16": _supports_bf16(name),
+            }
+        )
+    return accelerators
+
+
def collect() -> list[dict]:
- accelerators = _collect_via_pymtml()
- if accelerators:
- return accelerators
- return _collect_via_smi()
+ for fn in (_collect_via_pymtml, _collect_via_smi, _collect_via_torch):
+ accelerators = fn()
+ if accelerators:
+ return accelerators
+ return []
def detect_runtime_version() -> str | None:
- """Prefer torch.version.musa (most reliable when torchada is installed),
- fall back to scraping ``mthreads-gmi`` header.
- """
try:
import torch
ver = getattr(torch.version, "musa", None)
if ver:
return f"MUSA {ver}"
+ if getattr(torch.version, "cuda", None):
+ return f"MUSA (torch.cuda shim) {torch.version.cuda}"
except ImportError:
pass
@@ -174,17 +224,43 @@ def detect_runtime_version() -> str | None:
return None
+def detect_pcie_gen() -> str | None:
+    """Return the PCIe link figures parsed from ``mthreads-gmi``, or ``None``.
+
+    Takes the first ``| <n>x(<m>x) |`` cell in the tool's output and formats
+    it as ``"PCIe <n>x/<m>x"``.  Any failure to run the tool, or output with
+    no such cell, yields ``None``.
+
+    NOTE(review): despite the function name, these values look like lane
+    widths rather than a PCIe generation. Confirm against real output.
+    """
+    try:
+        report = subprocess.check_output(
+            ["mthreads-gmi"], text=True, stderr=subprocess.DEVNULL
+        )
+        link = re.search(r"\|\s*(\d+)x\((\d+)x\)\s*\|", report)
+    except Exception:
+        return None
+    if link is None:
+        return None
+    return f"PCIe {link.group(1)}x/{link.group(2)}x"
+
+
+def detect_intra_node_interconnect() -> str | None:
+    """Return ``"MCCL/PCIe"`` when ``collect()`` reports more than one device.
+
+    Moore Threads multi-GPU hosts typically use MCCL over PCIe; hosts with a
+    single device, or none, yield ``None``.
+    """
+    return "MCCL/PCIe" if len(collect()) > 1 else None
+
+
def diagnostics(env: dict, accelerators: list[dict]) -> list[str]:
notes: list[str] = []
- if accelerators and (env.get("pytorch_version") or "") == "unknown":
+ if not accelerators:
+ notes.append(
+ "No Moore Threads MUSA GPUs detected (tried pymtml, mthreads-gmi, "
+ "and torch). Install the MUSA driver/toolkit per "
+ "https://github.com/MooreThreads/vllm-musa ."
+ )
+ return notes
+ if (env.get("pytorch_version") or "") == "unknown":
notes.append(
- "PyTorch (with the torchada MUSA shim) is not installed — "
- "pytorch_version is unknown."
+ "PyTorch with MUSA support is not installed — pytorch_version is unknown."
)
- if accelerators and (env.get("runtime_version") or "") == "unknown":
+ if (env.get("runtime_version") or "") == "unknown":
notes.append(
"Could not detect MUSA runtime (tried torch.version.musa and "
- "mthreads-gmi). runtime_version is unknown — install torchada "
- "or the Moore Threads MUSA toolkit."
+ "mthreads-gmi). runtime_version is unknown."
)
return notes
diff --git a/schema/env.schema.json b/schema/env.schema.json
index 60fc5e8..e80cd94 100644
--- a/schema/env.schema.json
+++ b/schema/env.schema.json
@@ -16,7 +16,7 @@
"properties": {
"index": { "type": "integer" },
"name": { "type": "string" },
- "vendor": { "type": "string", "description": "Chip vendor, e.g. 'NVIDIA', 'AMD', 'Huawei', 'Apple'" },
+ "vendor": { "type": "string", "description": "Chip vendor, e.g. 'NVIDIA', 'AMD', 'Huawei', 'Apple', 'Moore Threads'" },
"memory_gb": { "type": ["number","null"], "minimum": 0 },
"driver_version": { "type": "string" },
"firmware_version": { "type": ["string","null"] },
diff --git a/schema/result.schema.json b/schema/result.schema.json
index 99a0517..fb81a8a 100644
--- a/schema/result.schema.json
+++ b/schema/result.schema.json
@@ -36,7 +36,7 @@
"vendor": {
"type": "string",
"enum": ["NVIDIA","AMD","Intel","Google","Huawei","Cambricon","Biren",
- "Enflame","MetaX","Iluvatar","Apple","Qualcomm","Other"]
+ "Enflame","MetaX","Moore Threads","Iluvatar","Apple","Qualcomm","Other"]
},
"count": { "type": "integer", "minimum": 1 },
"memory_gb": { "type": "number", "minimum": 0 },