diff --git a/README.md b/README.md
index ea9e2b6..3007966 100644
--- a/README.md
+++ b/README.md
@@ -93,6 +93,7 @@ Reference runners live under `runners/` (see each folder’s `meta.json`). The t
 | Huawei Ascend NPU | `ascend_vllm_ascend_d4aa9fda` | vllm-ascend | ✓ | ✓ | ✓ | ✓ | ✓ | — | — |
 | Apple Silicon | `apple_mlx_lm_9546b8b5` | mlx-lm | ⋯ | — | — | ⋯ | — | ⋯ | — |
 | Google TPU | `google_vllm_tpu_68cc9ffa` | vllm-tpu | ✓ | — | — | ✓ | — | ✓ | — |
+| Moore Threads GPU | `moorethreads_vllm_musa_f2f6f965` | vllm-musa | ✓ | ⋯ | ⋯ | ⋯ | ⋯ | ✓ | — |
 
 _Legend: ✓ validated · ⋯ author-declared (not smoke-tested in this repo yet) · — unsupported._
 <!-- platforms-matrix:end -->
diff --git a/configs/runner_configs/runner_moorethreads_vllm_musa_f2f6f965.yaml.example b/configs/runner_configs/runner_moorethreads_vllm_musa_f2f6f965.yaml.example
new file mode 100644
index 0000000..c18f98b
--- /dev/null
+++ b/configs/runner_configs/runner_moorethreads_vllm_musa_f2f6f965.yaml.example
@@ -0,0 +1,60 @@
+# AccelMark runner config — moorethreads_vllm_musa_f2f6f965 (vllm-musa on Moore Threads)
+#
+# Copy this file to runner_moorethreads_vllm_musa_f2f6f965.yaml (remove
+# .example suffix) and edit as needed for your hardware. The actual .yaml
+# is gitignored.
+#
+# These settings adapt the runner to your hardware environment. They are
+# recorded in result.json task.extra_config for transparency but are NOT
+# part of the benchmark identity (not hashed into run_id).
+#
+# Merge priority: CLI flags > suite-specific > global defaults > runner defaults
+
+# ── Global defaults (apply to all suites) ─────────────────────────────────────
+
+# Tensor parallel size — number of Moore Threads GPUs to use (default: 1).
+# For multi-card runs make sure to export VLLM_WORKER_MULTIPROC_METHOD=spawn.
+tensor_parallel_size: 1
+
+# Run in eager mode (disables graph capture / compilation). Set to true if you
+# hit Triton kernel errors on the first request (most common on S3000 / S80 paths).
+enforce_eager: false
+
+# Maximum number of sequences in a batch (default: 256).
+# Reduce on lower-memory cards: 128 on 24 GB cards, 64 on 16 GB cards.
+max_num_seqs: 256
+
+# Fraction of device memory vLLM may use for weights, activations and the KV
+# cache (default: 0.85). Reduce if you hit OOM; the vLLM flag is named
+# gpu_memory_utilization but applies to MUSA device memory via torchada.
+gpu_memory_utilization: 0.85
+
+# Pass-through kwargs forwarded directly to vLLM LLM() / AsyncEngineArgs().
+# Unknown keys are dropped automatically with a warning, so this is safe to
+# use across vLLM 0.10.x / 0.13.x.
+# engine_kwargs:
+#   swap_space: 8
+#   max_seq_len_to_capture: 4096
+
+# ── Suite-specific overrides ───────────────────────────────────────────────────
+
+suites:
+  suite_D:
+    # Long-context — reduce batch size and leave more device-memory headroom.
+    max_num_seqs: 32
+    gpu_memory_utilization: 0.80
+
+  suite_F:
+    max_num_seqs: 128
+
+# ── Speculative decoding (suite_A / suite_D extra scenario) ─────────────────
+# To enable, merge the suite_A block below into the existing `suites:` mapping
+# above (do not add a second `suites:` key). vllm-musa accepts the same
+# speculative_config dict as upstream vLLM; the runner translates flat keys
+# (speculative_model, num_speculative_tokens, ...) into it automatically.
+# suites:
+#   suite_A:
+#     engine_kwargs:
+#       speculative_model: "meta-llama/Llama-3.2-1B-Instruct"
+#       num_speculative_tokens: 4
+#       speculative_draft_tensor_parallel_size: 1
diff --git a/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/accuracy/accuracy.json b/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/accuracy/accuracy.json
new file mode 100644
index 0000000..7242234
--- /dev/null
+++ b/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/accuracy/accuracy.json
@@ -0,0 +1,8 @@
+{
+  "subset_score": 0.07,
+  "baseline_delta": -0.53,
+  "valid": false,
+  "framework": "vllm-musa",
+  "precision": "BF16",
+  "notes": "Integrated accuracy check \u2014 used same vllm-musa instance as benchmark."
+}
\ No newline at end of file
diff --git a/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/env_info.json b/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/env_info.json
new file mode 100644
index 0000000..4244ef7
--- /dev/null
+++ b/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/env_info.json
@@ -0,0 +1,48 @@
+{
+  "collected_at": "2026-05-18T09:21:31.092840+00:00",
+  "accelerators": [
+    {
+      "index": 0,
+      "name": "MTT S4000",
+      "vendor": "Moore Threads",
+      "memory_gb": 48.0,
+      "driver_version": "2.7.0",
+      "firmware_version": null,
+      "supports_bf16": true
+    }
+  ],
+  "accelerator_platform": "moorethreads",
+  "accelerator_topology": null,
+  "intra_node_interconnect": null,
+  "cpu": {
+    "model": "Intel(R) Xeon(R) Gold 6430",
+    "physical_cores": 64,
+    "logical_cores": 128,
+    "numa_nodes": 2
+  },
+  "system_memory_gb": 1007.5,
+  "pcie_generation": "PCIe 16x/16x",
+  "cpu_accelerator_bandwidth_gbs": null,
+  "network_interfaces": [
+    {
+      "name": "mlx5_0",
+      "type": "InfiniBand/RoCE",
+      "bandwidth_gbps": null
+    },
+    {
+      "name": "mlx5_1",
+      "type": "InfiniBand/RoCE",
+      "bandwidth_gbps": null
+    },
+    {
+      "name": "mlx5_bond_0",
+      "type": "InfiniBand/RoCE",
+      "bandwidth_gbps": null
+    }
+  ],
+  "os": "Ubuntu Jammy Jellyfish (development branch)",
+  "python_version": "3.10.8",
+  "kernel_version": "5.15.0-105-generic",
+  "runtime_version": "Moore Threads Driver 2.7.0",
+  "pytorch_version": "2.2.0"
+}
\ No newline at end of file
diff --git a/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/offline/result.json b/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/offline/result.json
new file mode 100644
index 0000000..a050fe4
--- /dev/null
+++ b/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/offline/result.json
@@ -0,0 +1,164 @@
+{
+  "schema_version": "1.0",
+  "suite_id": "suite_A",
+  "implementation_id": "moorethreads_vllm_musa_f2f6f965",
+  "chip": {
+    "name": "MTT S4000",
+    "vendor": "Moore Threads",
+    "count": 1,
+    "memory_gb": 48.0,
+    "interconnect_intra_node": null,
+    "interconnect_inter_node": null
+  },
+  "environment": {
+    "collected_at": "2026-05-18T09:21:31.092840+00:00",
+    "accelerators": [
+      {
+        "index": 0,
+        "name": "MTT S4000",
+        "vendor": "Moore Threads",
+        "memory_gb": 48.0,
+        "driver_version": "2.7.0",
+        "firmware_version": null,
+        "supports_bf16": true
+      }
+    ],
+    "accelerator_platform": "moorethreads",
+    "accelerator_topology": null,
+    "intra_node_interconnect": null,
+    "cpu": {
+      "model": "Intel(R) Xeon(R) Gold 6430",
+      "physical_cores": 64,
+      "logical_cores": 128,
+      "numa_nodes": 2
+    },
+    "system_memory_gb": 1007.5,
+    "pcie_generation": "PCIe 16x/16x",
+    "cpu_accelerator_bandwidth_gbs": null,
+    "network_interfaces": [
+      {
+        "name": "mlx5_0",
+        "type": "InfiniBand/RoCE",
+        "bandwidth_gbps": null
+      },
+      {
+        "name": "mlx5_1",
+        "type": "InfiniBand/RoCE",
+        "bandwidth_gbps": null
+      },
+      {
+        "name": "mlx5_bond_0",
+        "type": "InfiniBand/RoCE",
+        "bandwidth_gbps": null
+      }
+    ],
+    "os": "Ubuntu Jammy Jellyfish (development branch)",
+    "python_version": "3.10.8",
+    "kernel_version": "5.15.0-105-generic",
+    "runtime_version": "Moore Threads Driver 2.7.0",
+    "pytorch_version": "2.2.0"
+  },
+  "software": {
+    "framework": "vllm-musa",
+    "framework_version": "0.4.2",
+    "driver_version": "2.7.0",
+    "runtime_version": "Moore Threads Driver 2.7.0",
+    "os": "Ubuntu Jammy Jellyfish (development branch)",
+    "python_version": "3.10.8"
+  },
+  "model": {
+    "model_id": "meta-llama/Meta-Llama-3-8B-Instruct",
+    "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2",
+    "model_name": null,
+    "model_note": null,
+    "model_source": "local",
+    "architecture": "dense",
+    "parameter_count_b": 8.0,
+    "precision": "BF16",
+    "effective_dtype": "float16",
+    "quantization_method": null,
+    "model_format": "HuggingFace original"
+  },
+  "task": {
+    "scenario": "offline",
+    "num_runs": 3,
+    "warmup_runs": 1,
+    "parallelism": {
+      "tensor_parallel_size": 1,
+      "pipeline_parallel_size": 1,
+      "expert_parallel_size": 1,
+      "data_parallel_size": 1
+    },
+    "extra_config": null,
+    "runtime_metrics": null
+  },
+  "metrics": {
+    "offline": {
+      "results_by_concurrency": [
+        {
+          "client_concurrency": 8,
+          "throughput_tokens_per_sec": 332.62,
+          "throughput_tokens_per_sec_per_chip": 332.62,
+          "throughput_tokens_per_sec_total": 922.83,
+          "elapsed_seconds_median": 43.4,
+          "peak_memory_gb": null,
+          "power_watts_avg": null,
+          "power_watts_peak": null,
+          "oom": false,
+          "_throughput_note": "output_only",
+          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+        },
+        {
+          "client_concurrency": 32,
+          "throughput_tokens_per_sec": 331.64,
+          "throughput_tokens_per_sec_per_chip": 331.64,
+          "throughput_tokens_per_sec_total": 920.1,
+          "elapsed_seconds_median": 43.6,
+          "peak_memory_gb": null,
+          "power_watts_avg": null,
+          "power_watts_peak": null,
+          "oom": false,
+          "_throughput_note": "output_only",
+          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+        },
+        {
+          "client_concurrency": 128,
+          "throughput_tokens_per_sec": 331.76,
+          "throughput_tokens_per_sec_per_chip": 331.76,
+          "throughput_tokens_per_sec_total": 920.46,
+          "elapsed_seconds_median": 43.6,
+          "peak_memory_gb": null,
+          "power_watts_avg": null,
+          "power_watts_peak": null,
+          "oom": false,
+          "_throughput_note": "output_only",
+          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+        }
+      ]
+    }
+  },
+  "accuracy": {
+    "subset_score": null,
+    "baseline_delta": null,
+    "valid": false,
+    "notes": "Run --scenario accuracy to check model accuracy."
+  },
+  "meta": {
+    "submitted_by": "JuhaoLiang1997",
+    "submission_type": "individual",
+    "date": "2026-05-18",
+    "time": "17:34:52",
+    "run_id": "cabb7bd0",
+    "run_name": "mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0",
+    "flagged": null,
+    "reproduce_script": "runners/moorethreads_vllm_musa_f2f6f965/runner.py",
+    "env_info_file": "../env_info.json",
+    "log_file": "run.log",
+    "samples_file": "samples.jsonl",
+    "notes": null,
+    "benchmark_start_time": "2026-05-18T09:26:10.676960+00:00",
+    "benchmark_end_time": "2026-05-18T09:34:52.667112+00:00",
+    "benchmark_elapsed_minutes": 8.7,
+    "model_load_seconds": 116.8
+  }
+}
\ No newline at end of file
diff --git a/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/online/result.json b/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/online/result.json
new file mode 100644
index 0000000..064d6b8
--- /dev/null
+++ b/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/online/result.json
@@ -0,0 +1,163 @@
+{
+  "schema_version": "1.0",
+  "suite_id": "suite_A",
+  "implementation_id": "moorethreads_vllm_musa_f2f6f965",
+  "chip": {
+    "name": "MTT S4000",
+    "vendor": "Moore Threads",
+    "count": 1,
+    "memory_gb": 48.0,
+    "interconnect_intra_node": null,
+    "interconnect_inter_node": null
+  },
+  "environment": {
+    "collected_at": "2026-05-18T09:21:31.092840+00:00",
+    "accelerators": [
+      {
+        "index": 0,
+        "name": "MTT S4000",
+        "vendor": "Moore Threads",
+        "memory_gb": 48.0,
+        "driver_version": "2.7.0",
+        "firmware_version": null,
+        "supports_bf16": true
+      }
+    ],
+    "accelerator_platform": "moorethreads",
+    "accelerator_topology": null,
+    "intra_node_interconnect": null,
+    "cpu": {
+      "model": "Intel(R) Xeon(R) Gold 6430",
+      "physical_cores": 64,
+      "logical_cores": 128,
+      "numa_nodes": 2
+    },
+    "system_memory_gb": 1007.5,
+    "pcie_generation": "PCIe 16x/16x",
+    "cpu_accelerator_bandwidth_gbs": null,
+    "network_interfaces": [
+      {
+        "name": "mlx5_0",
+        "type": "InfiniBand/RoCE",
+        "bandwidth_gbps": null
+      },
+      {
+        "name": "mlx5_1",
+        "type": "InfiniBand/RoCE",
+        "bandwidth_gbps": null
+      },
+      {
+        "name": "mlx5_bond_0",
+        "type": "InfiniBand/RoCE",
+        "bandwidth_gbps": null
+      }
+    ],
+    "os": "Ubuntu Jammy Jellyfish (development branch)",
+    "python_version": "3.10.8",
+    "kernel_version": "5.15.0-105-generic",
+    "runtime_version": "Moore Threads Driver 2.7.0",
+    "pytorch_version": "2.2.0"
+  },
+  "software": {
+    "framework": "vllm-musa",
+    "framework_version": "0.4.2",
+    "driver_version": "2.7.0",
+    "runtime_version": "Moore Threads Driver 2.7.0",
+    "os": "Ubuntu Jammy Jellyfish (development branch)",
+    "python_version": "3.10.8"
+  },
+  "model": {
+    "model_id": "meta-llama/Meta-Llama-3-8B-Instruct",
+    "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2",
+    "model_name": null,
+    "model_note": null,
+    "model_source": "local",
+    "architecture": "dense",
+    "parameter_count_b": 8.0,
+    "precision": "BF16",
+    "effective_dtype": "float16",
+    "quantization_method": null,
+    "model_format": "HuggingFace original"
+  },
+  "task": {
+    "scenario": "online",
+    "num_runs": 3,
+    "warmup_runs": 1,
+    "parallelism": {
+      "tensor_parallel_size": 1,
+      "pipeline_parallel_size": 1,
+      "expert_parallel_size": 1,
+      "data_parallel_size": 1
+    },
+    "extra_config": null,
+    "runtime_metrics": null
+  },
+  "metrics": {
+    "online": {
+      "sla_ttft_ms": 500,
+      "max_valid_qps": 5,
+      "results_by_qps": [
+        {
+          "target_qps": 5,
+          "achieved_qps": 5.0,
+          "ttft_ms_p50": 194.45,
+          "ttft_ms_p90": 315.05,
+          "ttft_ms_p99": 424.55,
+          "tpot_ms_p50": 201.93,
+          "tpot_ms_p90": 253.8,
+          "tpot_ms_p99": 471.28,
+          "elapsed_seconds_median": 137.6,
+          "sla_met": true
+        },
+        {
+          "target_qps": 25,
+          "achieved_qps": 25.0,
+          "ttft_ms_p50": 4796.14,
+          "ttft_ms_p90": 8459.18,
+          "ttft_ms_p99": 9348.86,
+          "tpot_ms_p50": 355.01,
+          "tpot_ms_p90": 6430.04,
+          "tpot_ms_p99": 15579.83,
+          "elapsed_seconds_median": 93.0,
+          "sla_met": false
+        },
+        {
+          "target_qps": 100,
+          "achieved_qps": 100.0,
+          "ttft_ms_p50": 10354.27,
+          "ttft_ms_p90": 17651.16,
+          "ttft_ms_p99": 19078.89,
+          "tpot_ms_p50": 849.82,
+          "tpot_ms_p90": 8677.79,
+          "tpot_ms_p99": 14281.03,
+          "elapsed_seconds_median": 90.0,
+          "sla_met": false
+        }
+      ]
+    }
+  },
+  "accuracy": {
+    "subset_score": null,
+    "baseline_delta": null,
+    "valid": false,
+    "notes": "Run --scenario accuracy to check model accuracy."
+  },
+  "meta": {
+    "submitted_by": "JuhaoLiang1997",
+    "submission_type": "individual",
+    "date": "2026-05-18",
+    "time": "17:53:38",
+    "run_id": "cabb7bd0",
+    "run_name": "mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0",
+    "flagged": null,
+    "reproduce_script": "runners/moorethreads_vllm_musa_f2f6f965/runner.py",
+    "env_info_file": "../env_info.json",
+    "log_file": "run.log",
+    "samples_file": "samples.jsonl",
+    "notes": null,
+    "benchmark_start_time": "2026-05-18T09:37:13.745117+00:00",
+    "benchmark_end_time": "2026-05-18T09:53:38.865501+00:00",
+    "benchmark_elapsed_minutes": 16.4,
+    "model_load_seconds": 122.7
+  }
+}
\ No newline at end of file
diff --git a/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/result.json b/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/result.json
new file mode 100644
index 0000000..e4b1093
--- /dev/null
+++ b/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/result.json
@@ -0,0 +1,215 @@
+{
+  "schema_version": "1.0",
+  "suite_id": "suite_A",
+  "implementation_id": "moorethreads_vllm_musa_f2f6f965",
+  "chip": {
+    "name": "MTT S4000",
+    "vendor": "Moore Threads",
+    "count": 1,
+    "memory_gb": 48.0,
+    "interconnect_intra_node": null,
+    "interconnect_inter_node": null
+  },
+  "environment": {
+    "collected_at": "2026-05-18T09:21:31.092840+00:00",
+    "accelerators": [
+      {
+        "index": 0,
+        "name": "MTT S4000",
+        "vendor": "Moore Threads",
+        "memory_gb": 48.0,
+        "driver_version": "2.7.0",
+        "firmware_version": null,
+        "supports_bf16": true
+      }
+    ],
+    "accelerator_platform": "moorethreads",
+    "accelerator_topology": null,
+    "intra_node_interconnect": null,
+    "cpu": {
+      "model": "Intel(R) Xeon(R) Gold 6430",
+      "physical_cores": 64,
+      "logical_cores": 128,
+      "numa_nodes": 2
+    },
+    "system_memory_gb": 1007.5,
+    "pcie_generation": "PCIe 16x/16x",
+    "cpu_accelerator_bandwidth_gbs": null,
+    "network_interfaces": [
+      {
+        "name": "mlx5_0",
+        "type": "InfiniBand/RoCE",
+        "bandwidth_gbps": null
+      },
+      {
+        "name": "mlx5_1",
+        "type": "InfiniBand/RoCE",
+        "bandwidth_gbps": null
+      },
+      {
+        "name": "mlx5_bond_0",
+        "type": "InfiniBand/RoCE",
+        "bandwidth_gbps": null
+      }
+    ],
+    "os": "Ubuntu Jammy Jellyfish (development branch)",
+    "python_version": "3.10.8",
+    "kernel_version": "5.15.0-105-generic",
+    "runtime_version": "Moore Threads Driver 2.7.0",
+    "pytorch_version": "2.2.0"
+  },
+  "software": {
+    "framework": "vllm-musa",
+    "framework_version": "0.4.2",
+    "driver_version": "2.7.0",
+    "runtime_version": "Moore Threads Driver 2.7.0",
+    "os": "Ubuntu Jammy Jellyfish (development branch)",
+    "python_version": "3.10.8"
+  },
+  "model": {
+    "model_id": "meta-llama/Meta-Llama-3-8B-Instruct",
+    "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2",
+    "model_name": null,
+    "model_note": null,
+    "model_source": "local",
+    "architecture": "dense",
+    "parameter_count_b": 8.0,
+    "precision": "BF16",
+    "effective_dtype": "float16",
+    "quantization_method": null,
+    "model_format": "HuggingFace original"
+  },
+  "task": {
+    "scenarios_run": [
+      "offline",
+      "online"
+    ],
+    "parallelism": {
+      "tensor_parallel_size": 1,
+      "pipeline_parallel_size": 1,
+      "expert_parallel_size": 1,
+      "data_parallel_size": 1
+    },
+    "num_runs": 3,
+    "extra_config": null
+  },
+  "metrics": {
+    "derived": {},
+    "offline": {
+      "results_by_concurrency": [
+        {
+          "client_concurrency": 8,
+          "throughput_tokens_per_sec": 332.62,
+          "throughput_tokens_per_sec_per_chip": 332.62,
+          "throughput_tokens_per_sec_total": 922.83,
+          "elapsed_seconds_median": 43.4,
+          "peak_memory_gb": null,
+          "power_watts_avg": null,
+          "power_watts_peak": null,
+          "oom": false,
+          "_throughput_note": "output_only",
+          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+        },
+        {
+          "client_concurrency": 32,
+          "throughput_tokens_per_sec": 331.64,
+          "throughput_tokens_per_sec_per_chip": 331.64,
+          "throughput_tokens_per_sec_total": 920.1,
+          "elapsed_seconds_median": 43.6,
+          "peak_memory_gb": null,
+          "power_watts_avg": null,
+          "power_watts_peak": null,
+          "oom": false,
+          "_throughput_note": "output_only",
+          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+        },
+        {
+          "client_concurrency": 128,
+          "throughput_tokens_per_sec": 331.76,
+          "throughput_tokens_per_sec_per_chip": 331.76,
+          "throughput_tokens_per_sec_total": 920.46,
+          "elapsed_seconds_median": 43.6,
+          "peak_memory_gb": null,
+          "power_watts_avg": null,
+          "power_watts_peak": null,
+          "oom": false,
+          "_throughput_note": "output_only",
+          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+        }
+      ]
+    },
+    "online": {
+      "sla_ttft_ms": 500,
+      "max_valid_qps": 5,
+      "results_by_qps": [
+        {
+          "target_qps": 5,
+          "achieved_qps": 5.0,
+          "ttft_ms_p50": 194.45,
+          "ttft_ms_p90": 315.05,
+          "ttft_ms_p99": 424.55,
+          "tpot_ms_p50": 201.93,
+          "tpot_ms_p90": 253.8,
+          "tpot_ms_p99": 471.28,
+          "elapsed_seconds_median": 137.6,
+          "sla_met": true
+        },
+        {
+          "target_qps": 25,
+          "achieved_qps": 25.0,
+          "ttft_ms_p50": 4796.14,
+          "ttft_ms_p90": 8459.18,
+          "ttft_ms_p99": 9348.86,
+          "tpot_ms_p50": 355.01,
+          "tpot_ms_p90": 6430.04,
+          "tpot_ms_p99": 15579.83,
+          "elapsed_seconds_median": 93.0,
+          "sla_met": false
+        },
+        {
+          "target_qps": 100,
+          "achieved_qps": 100.0,
+          "ttft_ms_p50": 10354.27,
+          "ttft_ms_p90": 17651.16,
+          "ttft_ms_p99": 19078.89,
+          "tpot_ms_p50": 849.82,
+          "tpot_ms_p90": 8677.79,
+          "tpot_ms_p99": 14281.03,
+          "elapsed_seconds_median": 90.0,
+          "sla_met": false
+        }
+      ]
+    }
+  },
+  "accuracy": {
+    "subset_score": 0.07,
+    "baseline_delta": -0.53,
+    "valid": false,
+    "framework": "vllm-musa",
+    "precision": "BF16",
+    "notes": "Integrated accuracy check \u2014 used same vllm-musa instance as benchmark."
+  },
+  "meta": {
+    "submitted_by": "JuhaoLiang1997",
+    "submission_type": "individual",
+    "date": "2026-05-18",
+    "time": "17:34:52",
+    "run_id": "cabb7bd0",
+    "run_name": "mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0",
+    "flagged": null,
+    "reproduce_script": "runners/moorethreads_vllm_musa_f2f6f965/runner.py",
+    "env_info_file": "../env_info.json",
+    "log_file": "run.log",
+    "samples_file": "samples.jsonl",
+    "notes": "Partial run: ['offline', 'online'] succeeded, ['accuracy'] failed.",
+    "benchmark_start_time": "2026-05-18T09:26:10.676960+00:00",
+    "benchmark_end_time": "2026-05-18T09:34:52.667112+00:00",
+    "benchmark_elapsed_minutes": 25.1,
+    "model_load_seconds": 116.8,
+    "benchmark_elapsed_minutes_note": "Total across ['offline', 'online'] scenarios.",
+    "scenario_dirs": {
+      "offline": "results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/offline",
+      "online": "results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/online"
+    }
+  }
+}
\ No newline at end of file
diff --git a/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/accuracy/accuracy.json b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/accuracy/accuracy.json
new file mode 100644
index 0000000..63c6e92
--- /dev/null
+++ b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/accuracy/accuracy.json
@@ -0,0 +1,8 @@
+{
+  "subset_score": 0.07,
+  "baseline_delta": -0.31,
+  "valid": false,
+  "framework": "vllm-musa",
+  "precision": "BF16",
+  "notes": "Integrated accuracy check \u2014 used same vllm-musa instance as benchmark."
+}
\ No newline at end of file
diff --git a/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/env_info.json b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/env_info.json
new file mode 100644
index 0000000..31f501b
--- /dev/null
+++ b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/env_info.json
@@ -0,0 +1,48 @@
+{
+  "collected_at": "2026-05-18T08:40:55.208034+00:00",
+  "accelerators": [
+    {
+      "index": 0,
+      "name": "MTT S4000",
+      "vendor": "Moore Threads",
+      "memory_gb": 48.0,
+      "driver_version": "2.7.0",
+      "firmware_version": null,
+      "supports_bf16": true
+    }
+  ],
+  "accelerator_platform": "moorethreads",
+  "accelerator_topology": null,
+  "intra_node_interconnect": null,
+  "cpu": {
+    "model": "Intel(R) Xeon(R) Gold 6430",
+    "physical_cores": 64,
+    "logical_cores": 128,
+    "numa_nodes": 2
+  },
+  "system_memory_gb": 1007.5,
+  "pcie_generation": "PCIe 16x/16x",
+  "cpu_accelerator_bandwidth_gbs": null,
+  "network_interfaces": [
+    {
+      "name": "mlx5_0",
+      "type": "InfiniBand/RoCE",
+      "bandwidth_gbps": null
+    },
+    {
+      "name": "mlx5_1",
+      "type": "InfiniBand/RoCE",
+      "bandwidth_gbps": null
+    },
+    {
+      "name": "mlx5_bond_0",
+      "type": "InfiniBand/RoCE",
+      "bandwidth_gbps": null
+    }
+  ],
+  "os": "Ubuntu Jammy Jellyfish (development branch)",
+  "python_version": "3.10.8",
+  "kernel_version": "5.15.0-105-generic",
+  "runtime_version": "Moore Threads Driver 2.7.0",
+  "pytorch_version": "2.2.0"
+}
\ No newline at end of file
diff --git a/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/interactive/result.json b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/interactive/result.json
new file mode 100644
index 0000000..4f5ff81
--- /dev/null
+++ b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/interactive/result.json
@@ -0,0 +1,131 @@
+{
+  "schema_version": "1.0",
+  "suite_id": "suite_F",
+  "implementation_id": "moorethreads_vllm_musa_f2f6f965",
+  "chip": {
+    "name": "MTT S4000",
+    "vendor": "Moore Threads",
+    "count": 1,
+    "memory_gb": 48.0,
+    "interconnect_intra_node": null,
+    "interconnect_inter_node": null
+  },
+  "environment": {
+    "collected_at": "2026-05-18T08:40:55.208034+00:00",
+    "accelerators": [
+      {
+        "index": 0,
+        "name": "MTT S4000",
+        "vendor": "Moore Threads",
+        "memory_gb": 48.0,
+        "driver_version": "2.7.0",
+        "firmware_version": null,
+        "supports_bf16": true
+      }
+    ],
+    "accelerator_platform": "moorethreads",
+    "accelerator_topology": null,
+    "intra_node_interconnect": null,
+    "cpu": {
+      "model": "Intel(R) Xeon(R) Gold 6430",
+      "physical_cores": 64,
+      "logical_cores": 128,
+      "numa_nodes": 2
+    },
+    "system_memory_gb": 1007.5,
+    "pcie_generation": "PCIe 16x/16x",
+    "cpu_accelerator_bandwidth_gbs": null,
+    "network_interfaces": [
+      {
+        "name": "mlx5_0",
+        "type": "InfiniBand/RoCE",
+        "bandwidth_gbps": null
+      },
+      {
+        "name": "mlx5_1",
+        "type": "InfiniBand/RoCE",
+        "bandwidth_gbps": null
+      },
+      {
+        "name": "mlx5_bond_0",
+        "type": "InfiniBand/RoCE",
+        "bandwidth_gbps": null
+      }
+    ],
+    "os": "Ubuntu Jammy Jellyfish (development branch)",
+    "python_version": "3.10.8",
+    "kernel_version": "5.15.0-105-generic",
+    "runtime_version": "Moore Threads Driver 2.7.0",
+    "pytorch_version": "2.2.0"
+  },
+  "software": {
+    "framework": "vllm-musa",
+    "framework_version": "0.4.2",
+    "driver_version": "2.7.0",
+    "runtime_version": "Moore Threads Driver 2.7.0",
+    "os": "Ubuntu Jammy Jellyfish (development branch)",
+    "python_version": "3.10.8"
+  },
+  "model": {
+    "model_id": "Qwen/Qwen2.5-0.5B-Instruct",
+    "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775",
+    "model_name": null,
+    "model_note": null,
+    "model_source": "local",
+    "architecture": "dense",
+    "parameter_count_b": 0.5,
+    "precision": "BF16",
+    "effective_dtype": "float16",
+    "quantization_method": null,
+    "model_format": "HuggingFace original"
+  },
+  "task": {
+    "scenario": "interactive",
+    "num_runs": 3,
+    "warmup_runs": 1,
+    "parallelism": {
+      "tensor_parallel_size": 1,
+      "pipeline_parallel_size": 1,
+      "expert_parallel_size": 1,
+      "data_parallel_size": 1
+    },
+    "extra_config": null,
+    "runtime_metrics": null
+  },
+  "metrics": {
+    "interactive": {
+      "ttft_ms_p50": 25.89,
+      "ttft_ms_p90": 27.18,
+      "ttft_ms_p99": 28.51,
+      "tpot_ms_p50": 14.85,
+      "tpot_ms_p90": 15.17,
+      "tpot_ms_p99": 15.5,
+      "peak_memory_gb": null,
+      "elapsed_seconds_median": 481.4
+    }
+  },
+  "accuracy": {
+    "subset_score": null,
+    "baseline_delta": null,
+    "valid": false,
+    "notes": "Run --scenario accuracy to check model accuracy."
+  },
+  "meta": {
+    "submitted_by": "JuhaoLiang1997",
+    "submission_type": "individual",
+    "date": "2026-05-18",
+    "time": "17:21:09",
+    "run_id": "4f66d29d",
+    "run_name": "mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d",
+    "flagged": null,
+    "reproduce_script": "runners/moorethreads_vllm_musa_f2f6f965/runner.py",
+    "env_info_file": "../env_info.json",
+    "log_file": "run.log",
+    "samples_file": "samples.jsonl",
+    "notes": null,
+    "benchmark_start_time": "2026-05-18T08:56:46.686185+00:00",
+    "benchmark_end_time": "2026-05-18T09:21:09.800661+00:00",
+    "benchmark_elapsed_minutes": 24.4,
+    "model_load_seconds": 151.2
+  }
+}
\ No newline at end of file
diff --git a/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/offline/result.json b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/offline/result.json
new file mode 100644
index 0000000..2498167
--- /dev/null
+++ b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/offline/result.json
@@ -0,0 +1,164 @@
+{
+  "schema_version": "1.0",
+  "suite_id": "suite_F",
+  "implementation_id": "moorethreads_vllm_musa_f2f6f965",
+  "chip": {
+    "name": "MTT S4000",
+    "vendor": "Moore Threads",
+    "count": 1,
+    "memory_gb": 48.0,
+    "interconnect_intra_node": null,
+    "interconnect_inter_node": null
+  },
+  "environment": {
+    "collected_at": "2026-05-18T08:40:55.208034+00:00",
+    "accelerators": [
+      {
+        "index": 0,
+        "name": "MTT S4000",
+        "vendor": "Moore Threads",
+        "memory_gb": 48.0,
+        "driver_version": "2.7.0",
+        "firmware_version": null,
+        "supports_bf16": true
+      }
+    ],
+    "accelerator_platform": "moorethreads",
+    "accelerator_topology": null,
+    "intra_node_interconnect": null,
+    "cpu": {
+      "model": "Intel(R) Xeon(R) Gold 6430",
+      "physical_cores": 64,
+      "logical_cores": 128,
+      "numa_nodes": 2
+    },
+    "system_memory_gb": 1007.5,
+    "pcie_generation": "PCIe 16x/16x",
+    "cpu_accelerator_bandwidth_gbs": null,
+    "network_interfaces": [
+      {
+        "name": "mlx5_0",
+        "type": "InfiniBand/RoCE",
+        "bandwidth_gbps": null
+      },
+      {
+        "name": "mlx5_1",
+        "type": "InfiniBand/RoCE",
+        "bandwidth_gbps": null
+      },
+      {
+        "name": "mlx5_bond_0",
+        "type": "InfiniBand/RoCE",
+        "bandwidth_gbps": null
+      }
+    ],
+    "os": "Ubuntu Jammy Jellyfish (development branch)",
+    "python_version": "3.10.8",
+    "kernel_version": "5.15.0-105-generic",
+    "runtime_version": "Moore Threads Driver 2.7.0",
+    "pytorch_version": "2.2.0"
+  },
+  "software": {
+    "framework": "vllm-musa",
+    "framework_version": "0.4.2",
+    "driver_version": "2.7.0",
+    "runtime_version": "Moore Threads Driver 2.7.0",
+    "os": "Ubuntu Jammy Jellyfish (development branch)",
+    "python_version": "3.10.8"
+  },
+  "model": {
+    "model_id": "Qwen/Qwen2.5-0.5B-Instruct",
+    "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775",
+    "model_name": null,
+    "model_note": null,
+    "model_source": "local",
+    "architecture": "dense",
+    "parameter_count_b": 0.5,
+    "precision": "BF16",
+    "effective_dtype": "float16",
+    "quantization_method": null,
+    "model_format": "HuggingFace original"
+  },
+  "task": {
+    "scenario": "offline",
+    "num_runs": 3,
+    "warmup_runs": 1,
+    "parallelism": {
+      "tensor_parallel_size": 1,
+      "pipeline_parallel_size": 1,
+      "expert_parallel_size": 1,
+      "data_parallel_size": 1
+    },
+    "extra_config": null,
+    "runtime_metrics": null
+  },
+  "metrics": {
+    "offline": {
+      "results_by_concurrency": [
+        {
+          "client_concurrency": 4,
+          "throughput_tokens_per_sec": 1994.51,
+          "throughput_tokens_per_sec_per_chip": 1994.51,
+          "throughput_tokens_per_sec_total": 3642.41,
+          "elapsed_seconds_median": 12.5,
+          "peak_memory_gb": null,
+          "power_watts_avg": null,
+          "power_watts_peak": null,
+          "oom": false,
+          "_throughput_note": "output_only",
+          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+        },
+        {
+          "client_concurrency": 16,
+          "throughput_tokens_per_sec": 1998.44,
+          "throughput_tokens_per_sec_per_chip": 1998.44,
+          "throughput_tokens_per_sec_total": 3649.59,
+          "elapsed_seconds_median": 12.5,
+          "peak_memory_gb": null,
+          "power_watts_avg": null,
+          "power_watts_peak": null,
+          "oom": false,
+          "_throughput_note": "output_only",
+          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+        },
+        {
+          "client_concurrency": 64,
+          "throughput_tokens_per_sec": 2004.02,
+          "throughput_tokens_per_sec_per_chip": 2004.02,
+          "throughput_tokens_per_sec_total": 3659.77,
+          "elapsed_seconds_median": 12.5,
+          "peak_memory_gb": null,
+          "power_watts_avg": null,
+          "power_watts_peak": null,
+          "oom": false,
+          "_throughput_note": "output_only",
+          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+        }
+      ]
+    }
+  },
+  "accuracy": {
+    "subset_score": null,
+    "baseline_delta": null,
+    "valid": false,
+    "notes": "Run --scenario accuracy to check model accuracy."
+  },
+  "meta": {
+    "submitted_by": "JuhaoLiang1997",
+    "submission_type": "individual",
+    "date": "2026-05-18",
+    "time": "16:48:27",
+    "run_id": "4f66d29d",
+    "run_name": "mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d",
+    "flagged": null,
+    "reproduce_script": "runners/moorethreads_vllm_musa_f2f6f965/runner.py",
+    "env_info_file": "../env_info.json",
+    "log_file": "run.log",
+    "samples_file": "samples.jsonl",
+    "notes": null,
+    "benchmark_start_time": "2026-05-18T08:45:57.373367+00:00",
+    "benchmark_end_time": "2026-05-18T08:48:27.423209+00:00",
+    "benchmark_elapsed_minutes": 2.5,
+    "model_load_seconds": 146.8
+  }
+}
\ No newline at end of file
diff --git a/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/online/result.json b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/online/result.json
new file mode 100644
index 0000000..eb13372
--- /dev/null
+++ b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/online/result.json
@@ -0,0 +1,151 @@
+{
+  "schema_version": "1.0",
+  "suite_id": "suite_F",
+  "implementation_id": "moorethreads_vllm_musa_f2f6f965",
+  "chip": {
+    "name": "MTT S4000",
+    "vendor": "Moore Threads",
+    "count": 1,
+    "memory_gb": 48.0,
+    "interconnect_intra_node": null,
+    "interconnect_inter_node": null
+  },
+  "environment": {
+    "collected_at": "2026-05-18T08:40:55.208034+00:00",
+    "accelerators": [
+      {
+        "index": 0,
+        "name": "MTT S4000",
+        "vendor": "Moore Threads",
+        "memory_gb": 48.0,
+        "driver_version": "2.7.0",
+        "firmware_version": null,
+        "supports_bf16": true
+      }
+    ],
+    "accelerator_platform": "moorethreads",
+    "accelerator_topology": null,
+    "intra_node_interconnect": null,
+    "cpu": {
+      "model": "Intel(R) Xeon(R) Gold 6430",
+      "physical_cores": 64,
+      "logical_cores": 128,
+      "numa_nodes": 2
+    },
+    "system_memory_gb": 1007.5,
+    "pcie_generation": "PCIe 16x/16x",
+    "cpu_accelerator_bandwidth_gbs": null,
+    "network_interfaces": [
+      {
+        "name": "mlx5_0",
+        "type": "InfiniBand/RoCE",
+        "bandwidth_gbps": null
+      },
+      {
+        "name": "mlx5_1",
+        "type": "InfiniBand/RoCE",
+        "bandwidth_gbps": null
+      },
+      {
+        "name": "mlx5_bond_0",
+        "type": "InfiniBand/RoCE",
+        "bandwidth_gbps": null
+      }
+    ],
+    "os": "Ubuntu Jammy Jellyfish (development branch)",
+    "python_version": "3.10.8",
+    "kernel_version": "5.15.0-105-generic",
+    "runtime_version": "Moore Threads Driver 2.7.0",
+    "pytorch_version": "2.2.0"
+  },
+  "software": {
+    "framework": "vllm-musa",
+    "framework_version": "0.4.2",
+    "driver_version": "2.7.0",
+    "runtime_version": "Moore Threads Driver 2.7.0",
+    "os": "Ubuntu Jammy Jellyfish (development branch)",
+    "python_version": "3.10.8"
+  },
+  "model": {
+    "model_id": "Qwen/Qwen2.5-0.5B-Instruct",
+    "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775",
+    "model_name": null,
+    "model_note": null,
+    "model_source": "local",
+    "architecture": "dense",
+    "parameter_count_b": 0.5,
+    "precision": "BF16",
+    "effective_dtype": "float16",
+    "quantization_method": null,
+    "model_format": "HuggingFace original"
+  },
+  "task": {
+    "scenario": "online",
+    "num_runs": 3,
+    "warmup_runs": 1,
+    "parallelism": {
+      "tensor_parallel_size": 1,
+      "pipeline_parallel_size": 1,
+      "expert_parallel_size": 1,
+      "data_parallel_size": 1
+    },
+    "extra_config": null,
+    "runtime_metrics": null
+  },
+  "metrics": {
+    "online": {
+      "sla_ttft_ms": 500,
+      "max_valid_qps": 40,
+      "results_by_qps": [
+        {
+          "target_qps": 10,
+          "achieved_qps": 10.0,
+          "ttft_ms_p50": 47.68,
+          "ttft_ms_p90": 96.31,
+          "ttft_ms_p99": 956.22,
+          "tpot_ms_p50": 47.25,
+          "tpot_ms_p90": 80.82,
+          "tpot_ms_p99": 131.63,
+          "elapsed_seconds_median": 37.8,
+          "sla_met": false
+        },
+        {
+          "target_qps": 40,
+          "achieved_qps": 40.0,
+          "ttft_ms_p50": 94.5,
+          "ttft_ms_p90": 194.64,
+          "ttft_ms_p99": 331.88,
+          "tpot_ms_p50": 74.76,
+          "tpot_ms_p90": 287.01,
+          "tpot_ms_p99": 444.19,
+          "elapsed_seconds_median": 19.0,
+          "sla_met": true
+        }
+      ]
+    }
+  },
+  "accuracy": {
+    "subset_score": null,
+    "baseline_delta": null,
+    "valid": false,
+    "notes": "Run --scenario accuracy to check model accuracy."
+  },
+  "meta": {
+    "submitted_by": "JuhaoLiang1997",
+    "submission_type": "individual",
+    "date": "2026-05-18",
+    "time": "16:53:54",
+    "run_id": "4f66d29d",
+    "run_name": "mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d",
+    "flagged": null,
+    "reproduce_script": "runners/moorethreads_vllm_musa_f2f6f965/runner.py",
+    "env_info_file": "../env_info.json",
+    "log_file": "run.log",
+    "samples_file": "samples.jsonl",
+    "notes": null,
+    "benchmark_start_time": "2026-05-18T08:51:01.188901+00:00",
+    "benchmark_end_time": "2026-05-18T08:53:54.250762+00:00",
+    "benchmark_elapsed_minutes": 2.9,
+    "model_load_seconds": 132.6
+  }
+}
\ No newline at end of file
diff --git a/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/result.json b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/result.json
new file mode 100644
index 0000000..a1c073d
--- /dev/null
+++ b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/result.json
@@ -0,0 +1,215 @@
+{
+  "schema_version": "1.0",
+  "suite_id": "suite_F",
+  "implementation_id": "moorethreads_vllm_musa_f2f6f965",
+  "chip": {
+    "name": "MTT S4000",
+    "vendor": "Moore Threads",
+    "count": 1,
+    "memory_gb": 48.0,
+    "interconnect_intra_node": null,
+    "interconnect_inter_node": null
+  },
+  "environment": {
+    "collected_at": "2026-05-18T08:40:55.208034+00:00",
+    "accelerators": [
+      {
+        "index": 0,
+        "name": "MTT S4000",
+        "vendor": "Moore Threads",
+        "memory_gb": 48.0,
+        "driver_version": "2.7.0",
+        "firmware_version": null,
+        "supports_bf16": true
+      }
+    ],
+    "accelerator_platform": "moorethreads",
+    "accelerator_topology": null,
+    "intra_node_interconnect": null,
+    "cpu": {
+      "model": "Intel(R) Xeon(R) Gold 6430",
+      "physical_cores": 64,
+      "logical_cores": 128,
+      "numa_nodes": 2
+    },
+    "system_memory_gb": 1007.5,
+    "pcie_generation": "PCIe 16x/16x",
+    "cpu_accelerator_bandwidth_gbs": null,
+    "network_interfaces": [
+      {
+        "name": "mlx5_0",
+        "type": "InfiniBand/RoCE",
+        "bandwidth_gbps": null
+      },
+      {
+        "name": "mlx5_1",
+        "type": "InfiniBand/RoCE",
+        "bandwidth_gbps": null
+      },
+      {
+        "name": "mlx5_bond_0",
+        "type": "InfiniBand/RoCE",
+        "bandwidth_gbps": null
+      }
+    ],
+    "os": "Ubuntu Jammy Jellyfish (development branch)",
+    "python_version": "3.10.8",
+    "kernel_version": "5.15.0-105-generic",
+    "runtime_version": "Moore Threads Driver 2.7.0",
+    "pytorch_version": "2.2.0"
+  },
+  "software": {
+    "framework": "vllm-musa",
+    "framework_version": "0.4.2",
+    "driver_version": "2.7.0",
+    "runtime_version": "Moore Threads Driver 2.7.0",
+    "os": "Ubuntu Jammy Jellyfish (development branch)",
+    "python_version": "3.10.8"
+  },
+  "model": {
+    "model_id": "Qwen/Qwen2.5-0.5B-Instruct",
+    "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775",
+    "model_name": null,
+    "model_note": null,
+    "model_source": "local",
+    "architecture": "dense",
+    "parameter_count_b": 0.5,
+    "precision": "BF16",
+    "effective_dtype": "float16",
+    "quantization_method": null,
+    "model_format": "HuggingFace original"
+  },
+  "task": {
+    "scenarios_run": [
+      "offline",
+      "online",
+      "interactive"
+    ],
+    "parallelism": {
+      "tensor_parallel_size": 1,
+      "pipeline_parallel_size": 1,
+      "expert_parallel_size": 1,
+      "data_parallel_size": 1
+    },
+    "num_runs": 3,
+    "extra_config": null
+  },
+  "metrics": {
+    "derived": {},
+    "offline": {
+      "results_by_concurrency": [
+        {
+          "client_concurrency": 4,
+          "throughput_tokens_per_sec": 1994.51,
+          "throughput_tokens_per_sec_per_chip": 1994.51,
+          "throughput_tokens_per_sec_total": 3642.41,
+          "elapsed_seconds_median": 12.5,
+          "peak_memory_gb": null,
+          "power_watts_avg": null,
+          "power_watts_peak": null,
+          "oom": false,
+          "_throughput_note": "output_only",
+          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+        },
+        {
+          "client_concurrency": 16,
+          "throughput_tokens_per_sec": 1998.44,
+          "throughput_tokens_per_sec_per_chip": 1998.44,
+          "throughput_tokens_per_sec_total": 3649.59,
+          "elapsed_seconds_median": 12.5,
+          "peak_memory_gb": null,
+          "power_watts_avg": null,
+          "power_watts_peak": null,
+          "oom": false,
+          "_throughput_note": "output_only",
+          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+        },
+        {
+          "client_concurrency": 64,
+          "throughput_tokens_per_sec": 2004.02,
+          "throughput_tokens_per_sec_per_chip": 2004.02,
+          "throughput_tokens_per_sec_total": 3659.77,
+          "elapsed_seconds_median": 12.5,
+          "peak_memory_gb": null,
+          "power_watts_avg": null,
+          "power_watts_peak": null,
+          "oom": false,
+          "_throughput_note": "output_only",
+          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+        }
+      ]
+    },
+    "online": {
+      "sla_ttft_ms": 500,
+      "max_valid_qps": 40,
+      "results_by_qps": [
+        {
+          "target_qps": 10,
+          "achieved_qps": 10.0,
+          "ttft_ms_p50": 47.68,
+          "ttft_ms_p90": 96.31,
+          "ttft_ms_p99": 956.22,
+          "tpot_ms_p50": 47.25,
+          "tpot_ms_p90": 80.82,
+          "tpot_ms_p99": 131.63,
+          "elapsed_seconds_median": 37.8,
+          "sla_met": false
+        },
+        {
+          "target_qps": 40,
+          "achieved_qps": 40.0,
+          "ttft_ms_p50": 94.5,
+          "ttft_ms_p90": 194.64,
+          "ttft_ms_p99": 331.88,
+          "tpot_ms_p50": 74.76,
+          "tpot_ms_p90": 287.01,
+          "tpot_ms_p99": 444.19,
+          "elapsed_seconds_median": 19.0,
+          "sla_met": true
+        }
+      ]
+    },
+    "interactive": {
+      "ttft_ms_p50": 25.89,
+      "ttft_ms_p90": 27.18,
+      "ttft_ms_p99": 28.51,
+      "tpot_ms_p50": 14.85,
+      "tpot_ms_p90": 15.17,
+      "tpot_ms_p99": 15.5,
+      "peak_memory_gb": null,
+      "elapsed_seconds_median": 481.4
+    }
+  },
+  "accuracy": {
+    "subset_score": 0.07,
+    "baseline_delta": -0.31,
+    "valid": false,
+    "framework": "vllm-musa",
+    "precision": "BF16",
+    "notes": "Integrated accuracy check \u2014 used same vllm-musa instance as benchmark."
+  },
+  "meta": {
+    "submitted_by": "JuhaoLiang1997",
+    "submission_type": "individual",
+    "date": "2026-05-18",
+    "time": "16:48:27",
+    "run_id": "4f66d29d",
+    "run_name": "mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d",
+    "flagged": null,
+    "reproduce_script": "runners/moorethreads_vllm_musa_f2f6f965/runner.py",
+    "env_info_file": "../env_info.json",
+    "log_file": "run.log",
+    "samples_file": "samples.jsonl",
+    "notes": "Partial run: ['offline', 'online', 'interactive'] succeeded, ['accuracy'] failed.",
+    "benchmark_start_time": "2026-05-18T08:45:57.373367+00:00",
+    "benchmark_end_time": "2026-05-18T08:48:27.423209+00:00",
+    "benchmark_elapsed_minutes": 29.8,
+    "model_load_seconds": 146.8,
+    "benchmark_elapsed_minutes_note": "Total across ['offline', 'online', 'interactive'] scenarios.",
+    "scenario_dirs": {
+      "offline": "results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/offline",
+      "online": "results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/online",
+      "interactive": "results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/interactive"
+    }
+  }
+}
\ No newline at end of file
diff --git a/runners/README.md b/runners/README.md
index 95290aa..aaf4d81 100644
--- a/runners/README.md
+++ b/runners/README.md
@@ -252,7 +252,7 @@ nvidia_trtllm_fp8_8d2f1a4b
 amd_vllm_rocm_7b2e1d8f
 ascend_mindie_9c4a3f11
 apple_mlx_b3e21f09
-moorethreads_vllm_musa_57ff5443
+moorethreads_vllm_musa_f2f6f965
 ```
 
 ---
diff --git a/runners/moorethreads_vllm_musa_f2f6f965/README.md b/runners/moorethreads_vllm_musa_f2f6f965/README.md
new file mode 100644
index 0000000..5111bdc
--- /dev/null
+++ b/runners/moorethreads_vllm_musa_f2f6f965/README.md
@@ -0,0 +1,145 @@
+# moorethreads_vllm_musa_f2f6f965 — Moore Threads MUSA Runner (vllm-musa)
+
+AccelMark runner for Moore Threads MUSA GPUs using
+[vllm-musa](https://github.com/MooreThreads/vllm-musa).
+
+## Supported suites
+
+| Suite | Description | Notes |
+|-------|-------------|-------|
+| Suite A | Single-chip, Llama-3-8B | Validated on S4000 (default: accuracy/offline/online) |
+| Suite B | Multi-chip, Llama-3-70B | MCCL tensor parallelism; set `VLLM_WORKER_MULTIPROC_METHOD=spawn` |
+| Suite C | Quantization, Llama-3.1-8B | FP8 skipped (not supported); W8A8/W8A16 via compressed-tensors |
+| Suite D | Long context ~28K input, Llama-3.1-8B | Reduce `max_num_seqs` / `gpu_memory_utilization` in runner config |
+| Suite E | Multi-chip scaling, Llama-3-8B | MCCL tensor parallelism |
+| Suite F | Edge, Qwen2.5-0.5B | Validated on MTT S4000 (community result in repo) |
+| Suite G | MoE multi-chip, Mixtral-8x7B | Unsupported |
+
+## Hardware compatibility
+
+| GPU | BF16 / FP16 | Multi-chip TP | FP8 | Notes |
+|-----|-------------|---------------|-----|-------|
+| MTT S4000 / S5000 | ✅ (BF16 → float16 on vLLM &lt; 0.10) | ✅ (MCCL) | ❌ | Tested with vLLM 0.4.x+musa |
+| MTT S3000 / S80 | ✅ | ✅ | ❌ | May need `--enforce-eager` on Triton errors |
+
+FP8 is excluded — not supported on this runner. FP32 inference fails with
+FlashAttention on MUSA (use FP16 or BF16). Qwen3 requires a newer vLLM + MUSA port
+(Qwen2.5 / Llama-3 work on 0.4.x).
+
+## Prerequisites
+
+Install in this order — **do not** `pip install torch` or `vllm` from PyPI on a
+bare Linux host:
+
+**1. MUSA toolkit + driver**
+
+<https://developer.mthreads.com/musa/>
+
+**2. vllm-musa (official build)**
+
+| Resource | URL |
+|----------|-----|
+| Repository | <https://github.com/MooreThreads/vllm-musa> |
+| Build guide | [README_vllm_musa.md](https://github.com/MooreThreads/vllm-musa/blob/main/README_vllm_musa.md) |
+| PyTorch MUSA | <https://github.com/MooreThreads/torch_musa> |
+
+```bash
+git clone https://github.com/MooreThreads/vllm-musa.git
+cd vllm-musa
+bash build_musa.sh
+python -c "from vllm import LLM; print('vllm ok')"
+```
+
+**3. Runner dependencies**
+
+```bash
+pip install -r runners/moorethreads_vllm_musa_f2f6f965/requirements.txt
+```
+
+Pin `transformers` to **4.43–4.46** (not 5.x) when on vLLM 0.4.x; `requirements.txt` enforces this range.
+
+**Environment variables**
+
+```bash
+export MUSA_VISIBLE_DEVICES=0
+export VLLM_WORKER_MULTIPROC_METHOD=spawn   # when tensor_parallel_size > 1
+```
+
+## Smoke test
+
+```bash
+python runners/moorethreads_vllm_musa_f2f6f965/test_smoke.py
+python runners/moorethreads_vllm_musa_f2f6f965/test_smoke.py /path/to/model
+```
+
+## Accuracy
+
+AccelMark runs an integrated MMLU subset after each benchmark using the **same**
+vLLM instance as the perf run. The runner sets `device=musa`, dtype, and
+tokenizer correctly; low scores on vLLM **0.4.x+musa** reflect broken generation
+in that stack, not missing AccelMark wiring.
+
+| Model | Suite | Measured | Baseline |
+|-------|-------|----------|----------|
+| Qwen2.5-0.5B-Instruct | F | **~0.07** | 0.37 (FP16) / 0.38 (BF16) |
+| Llama-3-8B-Instruct | A | **~0.07** | 0.60 (BF16) |
+
+Throughput runs complete normally, but the generated answers are degenerate
+(repetition, system-prompt regurgitation), scoring a similar ~7% across different models.
+
+While accuracy is broken on 0.4.x, use `--skip-accuracy-gate` to finish a perf run:
+
+```bash
+python run.py --runner moorethreads_vllm_musa_f2f6f965 \
+  --suite suite_F --precision FP16 --skip-accuracy-gate
+```
+
+Likely fix: upgrade to vllm-musa aligned with vLLM **0.10+**, keep
+`transformers` 4.43–4.46 on legacy forks, then re-run without
+`--skip-accuracy-gate`.
+
+## Usage
+
+```bash
+python run.py --runner moorethreads_vllm_musa_f2f6f965 --suite suite_F --precision FP16
+
+VLLM_WORKER_MULTIPROC_METHOD=spawn \
+python run.py --runner moorethreads_vllm_musa_f2f6f965 \
+  --suite suite_B --tensor-parallel-size 8
+```
+
+Optional runner config (copy and edit):
+
+```bash
+cp configs/runner_configs/runner_moorethreads_vllm_musa_f2f6f965.yaml.example \
+   configs/runner_configs/runner_moorethreads_vllm_musa_f2f6f965.yaml
+```
+
+| Field | Default | Notes |
+|-------|---------|-------|
+| `tensor_parallel_size` | 1 | MCCL tensor parallelism |
+| `enforce_eager` | false | Only if Triton / graph capture errors |
+| `max_num_seqs` | 256 | Lower on cards with less device memory |
+| `gpu_memory_utilization` | 0.85 | Lower if OOM |
+
+## Troubleshooting
+
+| Symptom | Fix |
+|---------|-----|
+| `GLIBCXX_3.4.30` on import | Import `torch` before `transformers` (runner and smoke test do this) |
+| `KeyError: 'type'` in rope_scaling | Pin `transformers==4.46.3` (not 5.x) |
+| `Expected musa device, got cuda:0` | Use this runner (`device="musa"`) |
+| MMLU ~0.07 | See [Accuracy](#accuracy); `--skip-accuracy-gate` for perf-only runs |
+| OOM | Lower `gpu_memory_utilization` / `max_num_seqs` |
+| Triton / graph errors | `--enforce-eager` or `enforce_eager: true` in runner YAML |
+
+## Requirements
+
+See `requirements.txt` for AccelMark extras. vLLM, torch_musa, and the MUSA
+driver are installed per the official vllm-musa guide above (not from this file).
+
+Minimum environment:
+
+- Moore Threads GPU with MUSA driver
+- Python 3.10+
+- vllm-musa build per [MooreThreads/vllm-musa](https://github.com/MooreThreads/vllm-musa)
diff --git a/runners/moorethreads_vllm_musa_f2f6f965/meta.json b/runners/moorethreads_vllm_musa_f2f6f965/meta.json
new file mode 100644
index 0000000..e57d72d
--- /dev/null
+++ b/runners/moorethreads_vllm_musa_f2f6f965/meta.json
@@ -0,0 +1,21 @@
+{
+  "id": "moorethreads_vllm_musa_f2f6f965",
+  "platform": "moorethreads",
+  "name": "vllm-musa on Moore Threads MUSA GPU",
+  "framework": "vllm-musa",
+  "submitted_by": "JuhaoLiang1997",
+  "description": "AccelMark runner for Moore Threads MUSA GPUs using vllm-musa (https://github.com/MooreThreads/vllm-musa). Install torch/vllm per upstream README_vllm_musa.md; requirements.txt adds benchmark deps only. Sets device=musa; BF16 maps to float16 on vLLM <0.10. MCCL tensor parallelism. FP8 unsupported.",
+  "supersedes_chain": [],
+  "notes": "Smoke-tested on MTT S4000 (vLLM 0.4.2+musa): Suite A and F default scenarios run. MMLU not at baseline — see runner README.",
+  "created": "2026-05-18",
+  "hardware_label": null,
+  "suite_support": {
+    "A": "validated",
+    "B": "pending",
+    "C": "pending",
+    "D": "pending",
+    "E": "pending",
+    "F": "validated",
+    "G": "unsupported"
+  }
+}
diff --git a/runners/moorethreads_vllm_musa_f2f6f965/requirements.txt b/runners/moorethreads_vllm_musa_f2f6f965/requirements.txt
new file mode 100644
index 0000000..1fe16ee
--- /dev/null
+++ b/runners/moorethreads_vllm_musa_f2f6f965/requirements.txt
@@ -0,0 +1,22 @@
+# AccelMark — moorethreads_vllm_musa_f2f6f965
+#
+# AccelMark benchmark dependencies only. Install MUSA toolkit, torch_musa, and
+# vllm-musa first — see README.md and https://github.com/MooreThreads/vllm-musa
+#
+#   pip install -r runners/moorethreads_vllm_musa_f2f6f965/requirements.txt
+#   python runners/moorethreads_vllm_musa_f2f6f965/test_smoke.py
+
+# AccelMark / loadgen
+numpy==1.26.4
+jsonschema==4.25.1
+psutil==7.1.0
+tqdm==4.67.1
+aiohttp==3.12.15
+PyYAML==6.0.2
+
+# Tokenizer / config (pin to match vLLM 0.4.x — see README)
+transformers>=4.43.0,<4.47.0
+tokenizers>=0.20.0,<0.21.0
+huggingface-hub>=0.26.0,<0.27.0
+accelerate>=1.2.0,<1.3.0
+safetensors>=0.4.5,<0.5.0
diff --git a/runners/moorethreads_vllm_musa_f2f6f965/runner.py b/runners/moorethreads_vllm_musa_f2f6f965/runner.py
new file mode 100644
index 0000000..b693369
--- /dev/null
+++ b/runners/moorethreads_vllm_musa_f2f6f965/runner.py
@@ -0,0 +1,440 @@
+"""
+AccelMark — Moore Threads MUSA vLLM benchmark runner (vllm-musa).
+
+Implements BenchmarkRunner for vllm-musa on Moore Threads MUSA GPUs.
+See README.md in this folder for install and hardware notes.
+"""
+
+import asyncio
+import gc
+import sys
+import time
+from pathlib import Path
+from typing import Optional
+
+_REPO_ROOT = Path(__file__).resolve().parent.parent.parent
+sys.path.insert(0, str(_REPO_ROOT))
+
+from runners.benchmark_runner import BenchmarkRunner, InferenceRequest
+from loadgen.types import InferenceResult
+
+import logging
+logging.getLogger("vllm.engine.async_llm_engine").setLevel(logging.WARNING)
+logging.getLogger("vllm.engine.llm_engine").setLevel(logging.WARNING)
+
+
+class MoorethreadsVLLMMUSARunner(BenchmarkRunner):
+    """BenchmarkRunner implementation for vLLM on Moore Threads MUSA GPUs.
+
+    Drives vllm-musa either through the synchronous ``LLM`` entry point
+    (offline batches) or through ``AsyncLLMEngine`` (streaming requests);
+    ``load_model`` picks one based on ``parallelism["use_async"]``.
+    """
+
+    # Capability flags. NOTE(review): presumably consumed by the
+    # BenchmarkRunner base class / harness — confirm against
+    # runners/benchmark_runner.py. SUPPORTS_MULTI_CHIP is read in parse_args.
+    SUPPORTS_STREAMING = True
+    SUPPORTS_BATCHING = True
+    SUPPORTS_ONLINE = True
+    SUPPORTS_MULTI_CHIP = True
+
+    # Precisions and quantization backends this runner declares support for.
+    SUPPORTED_PRECISIONS = ["bf16", "fp16"]
+    SUPPORTED_QUANTIZATION_BACKENDS = ["compressed-tensors"]
+
+    # Class-wide guard: _prepare_musa_runtime() flips this so its import
+    # runs only once per process.
+    _musa_runtime_prepared = False
+
+    def __init__(self):
+        """Start with no engine loaded; ``load_model`` fills these fields in."""
+        # Event loop created by load_model on the async engine path.
+        self._loop: asyncio.AbstractEventLoop = None
+        # Shared by both execution paths.
+        self.tokenizer = None
+        self.sampling_params = None
+        # load_model sets ``llm`` (offline) or ``engine`` (async).
+        self.engine = None
+        self.llm = None
+
+    def _get_chip_count(self) -> int:
+        """Return the number of visible Moore Threads devices (never below 1).
+
+        Detection order:
+          1. ``pymtml`` device count.
+          2. ``torch.musa.device_count()`` when torch exposes a ``musa`` module.
+          3. ``torch.cuda.device_count()``.
+
+        ``torch.musa`` is tried before ``torch.cuda`` so the count is also
+        correct when ``torch.cuda`` is not aliased to the MUSA backend, in
+        line with the ``hasattr(torch, "musa")`` checks in test_smoke.py.
+        Every probe is best-effort; 1 is returned when all of them fail.
+        """
+        try:
+            import pymtml
+            pymtml.mtmlInit()
+            try:
+                count = pymtml.mtmlDeviceGetCount()
+            finally:
+                # Always pair mtmlInit with mtmlShutdown, even on failure.
+                try:
+                    pymtml.mtmlShutdown()
+                except Exception:
+                    pass
+            if count and count > 0:
+                return int(count)
+        except Exception:
+            pass
+
+        try:
+            import torch
+        except Exception:
+            return 1
+
+        for backend_name in ("musa", "cuda"):
+            backend = getattr(torch, backend_name, None)
+            if backend is None:
+                continue
+            try:
+                count = int(backend.device_count())
+            except Exception:
+                continue
+            if count > 0:
+                return count
+        return 1
+
+    def _get_framework_name(self) -> str:
+        """Return the framework identifier reported for this runner."""
+        return "vllm-musa"
+
+    def _get_framework_version(self) -> str:
+        """Build a version label from the vllm-musa plugin and the vLLM core.
+
+        The plugin version comes from the installed ``vllm-musa`` distribution
+        metadata, falling back to ``vllm_musa_platform.__version__``. The
+        result is ``"<plugin>+vllm-<core>"`` when the plugin version is known,
+        otherwise the bare core version (which is ``"unknown"`` when vLLM
+        cannot be imported either).
+        """
+        unknown = "unknown"
+
+        try:
+            from importlib.metadata import version as dist_version
+            plugin = dist_version("vllm-musa")
+        except Exception:
+            plugin = unknown
+            try:
+                import vllm_musa_platform  # type: ignore
+                plugin = getattr(vllm_musa_platform, "__version__", unknown)
+            except Exception:
+                pass
+
+        try:
+            import vllm
+            core = vllm.__version__
+        except Exception:
+            core = unknown
+
+        if plugin != unknown:
+            return f"{plugin}+vllm-{core}"
+        # Plugin version unavailable: report the core version alone.
+        return core
+
+    def get_model_format(self) -> str:
+        """Return the label describing the checkpoint format this runner loads."""
+        return "HuggingFace original"
+
+    @classmethod
+    def _prepare_musa_runtime(cls) -> None:
+        """Import torch once per process before the vLLM stack is imported.
+
+        ``load_model`` calls this ahead of its transformers / vllm imports.
+        NOTE(review): presumably a shared-library load-order workaround, like
+        the early ``import torch`` in test_smoke.py — confirm.
+        """
+        if cls._musa_runtime_prepared:
+            return
+        import torch  # noqa: F401
+        # Remember that the import already happened for later load_model calls.
+        cls._musa_runtime_prepared = True
+
+    @staticmethod
+    def _legacy_vllm_musa() -> bool:
+        """Report whether the installed vLLM core is older than 0.10.
+
+        Any local-version suffix after ``+`` is ignored. Returns ``True`` when
+        vLLM cannot be imported or its version string cannot be parsed.
+        """
+        try:
+            import vllm
+            release = vllm.__version__.partition("+")[0]
+            pieces = release.split(".")
+            major, minor = int(pieces[0]), int(pieces[1])
+        except Exception:
+            # Unknown version: assume the legacy code path.
+            return True
+        return (major, minor) < (0, 10)
+
+    @staticmethod
+    def _get_engine_arg_fields() -> set[str]:
+        """Return the dataclass field names of vLLM's ``EngineArgs``.
+
+        An empty set means the fields could not be introspected (for example
+        vLLM is not importable); callers treat that as "do not filter".
+        """
+        try:
+            from dataclasses import fields as dataclass_fields
+            from vllm.engine.arg_utils import EngineArgs
+            names = set()
+            for field in dataclass_fields(EngineArgs):
+                names.add(field.name)
+            return names
+        except Exception:
+            return set()
+
+    def _resolve_musa_dtype(self, dtype: str, precision: str) -> str:
+        """Map the requested dtype to one usable on the installed vLLM.
+
+        On a non-legacy vLLM (see ``_legacy_vllm_musa``) the dtype is returned
+        unchanged. On a legacy one, ``bfloat16`` / ``auto`` requests and any
+        BF16 precision are forced to ``float16``, with a printed note.
+        """
+        if self._legacy_vllm_musa():
+            forced_fp16 = (
+                dtype in ("bfloat16", "auto") or precision.upper() == "BF16"
+            )
+            if forced_fp16:
+                if dtype != "float16":
+                    print("  Note: vLLM 0.4.x+musa — using float16")
+                return "float16"
+        return dtype
+
+    @staticmethod
+    def _fold_speculative_kwargs(kwargs: dict, valid_fields: set) -> dict:
+        """Fold flat speculative-decoding keys into a ``speculative_config`` dict.
+
+        Returns ``kwargs`` unchanged when there is no ``speculative_model``
+        key, when ``speculative_config`` is already given, or when the engine
+        is known not to accept ``speculative_config`` (``valid_fields`` is
+        non-empty and lacks it). Otherwise returns a new dict.
+        """
+        if "speculative_model" not in kwargs or "speculative_config" in kwargs:
+            return kwargs
+        if valid_fields and "speculative_config" not in valid_fields:
+            # This engine has no speculative_config field: keep the flat keys.
+            return kwargs
+        folded = dict(kwargs)
+        folded["speculative_config"] = {
+            "model": folded.pop("speculative_model"),
+            "num_speculative_tokens": folded.pop("num_speculative_tokens", 4),
+            "draft_tensor_parallel_size": folded.pop(
+                "speculative_draft_tensor_parallel_size", 1
+            ),
+        }
+        return folded
+
+    def load_model(self, model_path: str, parallelism: dict) -> None:
+        """Load the tokenizer and build the vLLM engine on the MUSA device.
+
+        Builds a synchronous ``LLM`` when ``parallelism["use_async"]`` is
+        falsy, otherwise an ``AsyncLLMEngine`` bound to a fresh event loop
+        stored in ``self._loop``.
+
+        Args:
+            model_path: Model identifier or path handed to the tokenizer and
+                the engine.
+            parallelism: Must contain ``tensor_parallel_size``,
+                ``pipeline_parallel_size``, ``max_tokens``, ``max_model_len``
+                and ``use_async``; ``expert_parallel_size`` is optional.
+
+        Raises:
+            AssertionError: If pipeline parallelism greater than 1 is requested.
+        """
+        self._prepare_musa_runtime()
+
+        from transformers import AutoTokenizer
+        from vllm import LLM, AsyncLLMEngine, SamplingParams
+        from vllm.engine.arg_utils import AsyncEngineArgs
+
+        tp_size = parallelism["tensor_parallel_size"]
+        pp_size = parallelism["pipeline_parallel_size"]
+        ep_size = parallelism.get("expert_parallel_size", 1)
+        assert pp_size <= 1, (
+            "Pipeline parallelism is not supported. Use --tensor-parallel-size."
+        )
+
+        max_tokens = parallelism["max_tokens"]
+        max_model_len = parallelism["max_model_len"]
+        use_async = parallelism["use_async"]
+        enforce_eager = getattr(self, "_enforce_eager", False)
+
+        cfg = getattr(self, "_runner_config", {})
+        max_num_seqs = cfg.get("max_num_seqs", 256)
+        musa_memory_util = cfg.get("gpu_memory_utilization", 0.85)
+
+        _valid_engine_fields = self._get_engine_arg_fields()
+        # Fold the flat speculative keys BEFORE filtering: otherwise the
+        # filter drops them on engines that only know speculative_config,
+        # while on engines without that field the fold would create a kwarg
+        # the engine rejects.
+        extra_kwargs = self._fold_speculative_kwargs(
+            dict(cfg.get("engine_kwargs") or {}), _valid_engine_fields
+        )
+
+        if _valid_engine_fields:
+            _dropped = [k for k in extra_kwargs if k not in _valid_engine_fields]
+            if _dropped:
+                print(f"  Warning: engine_kwargs keys not supported by this "
+                      f"vllm-musa / vLLM version and will be ignored: "
+                      f"{_dropped}")
+            extra_kwargs = {k: v for k, v in extra_kwargs.items()
+                            if k in _valid_engine_fields}
+
+        effective_precision = getattr(self, "_effective_precision", "BF16").upper()
+        # Normalise case: the dtype map below is keyed by upper-case names,
+        # while SUPPORTED_PRECISIONS uses lower-case spellings.
+        precision = (getattr(self, "_precision", None) or effective_precision).upper()
+        _dtype_override = getattr(self, "_precision_dtype_override", None)
+        _prec_eng_kwargs = dict(getattr(self, "_precision_engine_kwargs", None) or {})
+        quantization = _prec_eng_kwargs.pop("quantization", None)
+
+        _NATIVE_DTYPE_MAP = {"BF16": "bfloat16", "FP16": "float16", "FP32": "float32"}
+        dtype = _NATIVE_DTYPE_MAP.get(precision, "auto")
+        self._quantization_method = quantization
+
+        if _dtype_override:
+            dtype = _dtype_override
+        dtype = self._resolve_musa_dtype(dtype, precision)
+        if _prec_eng_kwargs:
+            # Runner-config engine_kwargs win over precision-derived kwargs.
+            _prec_eng_kwargs.update(extra_kwargs)
+            extra_kwargs = self._fold_speculative_kwargs(
+                _prec_eng_kwargs, _valid_engine_fields
+            )
+
+        print(
+            f"Loading model: precision={precision}, dtype={dtype}"
+            + (f", quantization_method={self._quantization_method}"
+               if self._quantization_method else "")
+        )
+
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            model_path, trust_remote_code=False
+        )
+        # temperature=0.0 requests greedy decoding.
+        self.sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0)
+
+        base_kwargs = dict(
+            model=model_path,
+            dtype=dtype,
+            tensor_parallel_size=tp_size,
+            trust_remote_code=False,
+            enforce_eager=enforce_eager,
+        )
+        # Pass device="musa" unless EngineArgs is known not to have a
+        # ``device`` field.
+        if not _valid_engine_fields or "device" in _valid_engine_fields:
+            base_kwargs["device"] = "musa"
+        if ep_size > 1:
+            base_kwargs["enable_expert_parallel"] = True
+        if quantization:
+            base_kwargs["quantization"] = quantization
+        if max_model_len:
+            base_kwargs["max_model_len"] = max_model_len
+
+        # Both paths honour the configured batch size and memory budget;
+        # explicit engine_kwargs come last so they can still override them.
+        engine_kwargs = {
+            **base_kwargs,
+            "max_num_seqs": max_num_seqs,
+            "gpu_memory_utilization": musa_memory_util,
+            **extra_kwargs,
+        }
+
+        if not use_async:
+            self.llm = LLM(**engine_kwargs)
+        else:
+            self._loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(self._loop)
+            engine_args = AsyncEngineArgs(**engine_kwargs)
+            self.engine = AsyncLLMEngine.from_engine_args(engine_args)
+
+    def get_effective_dtype(self) -> Optional[str]:
+        """Return the dtype the loaded engine actually uses, without ``torch.``.
+
+        Reads the model config of whichever engine is loaded (sync first,
+        then async). Falls back to ``self._effective_dtype`` — ``None`` when
+        that attribute is unset — if no engine is loaded or the lookup fails.
+        """
+        try:
+            if self.llm is not None:
+                model_config = self.llm.llm_engine.model_config
+                return str(model_config.dtype).replace("torch.", "")
+            if self.engine is not None:
+                model_config = self.engine.engine.model_config
+                return str(model_config.dtype).replace("torch.", "")
+        except Exception:
+            pass
+        return getattr(self, "_effective_dtype", None)
+
+    def inference_fn_offline(self, requests: list[InferenceRequest]) -> list[InferenceResult]:
+        """Generate completions for a whole batch with the synchronous engine.
+
+        Every result carries the wall-clock time of the entire batch as
+        ``total_time_ms`` and no first-token time. The generated texts are
+        also kept in ``self._last_accuracy_outputs``.
+        """
+        prompts = [self._format_prompt(req.prompt) for req in requests]
+
+        started = time.perf_counter()
+        generations = self.llm.generate(prompts, self.sampling_params)
+        elapsed = time.perf_counter() - started
+
+        self._last_accuracy_outputs = [gen.outputs[0].text for gen in generations]
+
+        results = []
+        for gen in generations:
+            completion = gen.outputs[0]
+            results.append(
+                InferenceResult(
+                    first_token_time_ms=None,
+                    total_time_ms=elapsed * 1000,
+                    output_tokens=len(completion.token_ids),
+                    input_tokens=len(gen.prompt_token_ids),
+                    success=True,
+                    output_text=completion.text,
+                )
+            )
+        return results
+
+    async def inference_fn_streaming(self, request: InferenceRequest) -> InferenceResult:
+        """Run one request through the async engine and time it.
+
+        ``first_token_time_ms`` is taken when the first streamed output with
+        at least one generated token arrives; it stays ``None`` when no token
+        is produced. ``input_tokens`` is the length of the prompt token ids
+        reported on the streamed outputs, matching ``inference_fn_offline``;
+        it stays 0 when the engine does not expose them.
+        """
+        from vllm.utils import random_uuid
+
+        formatted = self._format_prompt(request.prompt)
+        request_id = random_uuid()
+        t_start = time.perf_counter()
+        first_token_time_ms = None
+        input_tokens = 0
+        output_tokens = 0
+        output_text = ""
+
+        async for output in self.engine.generate(
+            formatted, self.sampling_params, request_id
+        ):
+            completion = output.outputs[0]
+            if first_token_time_ms is None and len(completion.token_ids) > 0:
+                first_token_time_ms = (time.perf_counter() - t_start) * 1000
+            # Each streamed output is cumulative: keep the latest snapshot.
+            output_tokens = len(completion.token_ids)
+            output_text = completion.text
+            prompt_ids = getattr(output, "prompt_token_ids", None)
+            if prompt_ids:
+                input_tokens = len(prompt_ids)
+
+        return InferenceResult(
+            first_token_time_ms=first_token_time_ms,
+            total_time_ms=(time.perf_counter() - t_start) * 1000,
+            output_tokens=output_tokens,
+            input_tokens=input_tokens,
+            success=True,
+            output_text=output_text,
+        )
+
+    async def inference_fn_token_stream(self, request: InferenceRequest):
+        """Yield the newly generated text of one request as it streams in.
+
+        The engine reports the cumulative text on every step; only the suffix
+        beyond what has already been yielded is emitted, and steps that add
+        no text are skipped.
+        """
+        from vllm.utils import random_uuid
+
+        prompt = self._format_prompt(request.prompt)
+        stream = self.engine.generate(prompt, self.sampling_params, random_uuid())
+
+        emitted = 0
+        async for snapshot in stream:
+            text = snapshot.outputs[0].text
+            if len(text) > emitted:
+                yield text[emitted:]
+                emitted = len(text)
+
+    def get_peak_memory_gb(self) -> Optional[float]:
+        """Return peak device memory in GiB, or ``None`` when unavailable.
+
+        Order of probes:
+          1. ``max_memory_allocated()`` of ``torch.musa``, then ``torch.cuda``.
+             A zero reading is treated as "no data" so a backend that is not
+             the active one cannot mask the next probe with 0.0.
+          2. ``pymtml`` memory info of device 0. This reads the ``used``
+             field at call time, so it is current usage rather than a
+             recorded peak.
+        """
+        try:
+            import torch
+        except Exception:
+            torch = None
+
+        if torch is not None:
+            for backend_name in ("musa", "cuda"):
+                backend = getattr(torch, backend_name, None)
+                if backend is None:
+                    continue
+                try:
+                    peak_bytes = backend.max_memory_allocated()
+                except Exception:
+                    continue
+                if peak_bytes and peak_bytes > 0:
+                    return peak_bytes / (1024 ** 3)
+
+        try:
+            import pymtml
+            pymtml.mtmlInit()
+            try:
+                dev = pymtml.mtmlDeviceGetByIndex(0)
+                info = pymtml.mtmlDeviceGetMemoryInfo(dev)
+                used = getattr(info, "used", None)
+                if used is not None:
+                    return float(used) / (1024 ** 3)
+            finally:
+                # Always pair mtmlInit with mtmlShutdown.
+                try:
+                    pymtml.mtmlShutdown()
+                except Exception:
+                    pass
+        except Exception:
+            pass
+        return None
+
+    def release_resources(self) -> None:
+        """Tear down the loaded engine and free device memory (best effort).
+
+        Steps, each isolated so one failure does not block the rest:
+          1. Drop the synchronous ``LLM``.
+          2. Shut down and drop the async engine.
+          3. Destroy vLLM's distributed state, then any remaining
+             ``torch.distributed`` process group.
+          4. Run the garbage collector, then empty the allocator cache and
+             reset peak-memory statistics on ``torch.musa`` and
+             ``torch.cuda``, whichever exist. ``torch.musa`` is included so
+             memory is also released when ``torch.cuda`` is not aliased to
+             the MUSA backend.
+        """
+        if self.llm is not None:
+            try:
+                del self.llm
+            except Exception:
+                pass
+            self.llm = None
+
+        if self.engine is not None:
+            try:
+                if self._loop and not self._loop.is_closed():
+                    self._loop.run_until_complete(self.engine.shutdown())
+            except Exception:
+                pass
+            try:
+                del self.engine
+            except Exception:
+                pass
+            self.engine = None
+
+        try:
+            from vllm.distributed.parallel_state import cleanup_dist_env_and_memory
+            cleanup_dist_env_and_memory(shutdown_ray=False)
+        except Exception:
+            # Helper missing or failed: fall back to the two-step teardown.
+            try:
+                from vllm.distributed.parallel_state import (
+                    destroy_model_parallel,
+                    destroy_distributed_environment,
+                )
+                destroy_model_parallel()
+                destroy_distributed_environment()
+            except Exception:
+                pass
+
+        try:
+            import torch
+            if torch.distributed.is_initialized():
+                torch.distributed.destroy_process_group()
+        except Exception:
+            pass
+
+        # Collect first so tensors held only by dead Python objects are
+        # returned to the allocator before its cache is emptied.
+        gc.collect()
+
+        try:
+            import torch
+        except Exception:
+            return
+        for backend_name in ("musa", "cuda"):
+            backend = getattr(torch, backend_name, None)
+            if backend is None:
+                continue
+            try:
+                backend.empty_cache()
+                backend.reset_peak_memory_stats()
+            except Exception:
+                pass
+
+    def parse_args(self):
+        """Parse the base CLI plus the vllm-musa specific flags.
+
+        The base class parses first (and pre-loads the runner config). This
+        method then reads ``--tensor-parallel-size``, ``--expert-parallel-size``
+        and ``--enforce-eager`` from the same command line and stores the
+        outcome in ``self._parallelism``, ``self._chip_count``,
+        ``self._enforce_eager`` and ``self._precision``.
+
+        Returns:
+            The namespace returned by the base class parser.
+        """
+        args = super().parse_args()
+        cfg = self._runner_config
+
+        import argparse
+        musa_parser = argparse.ArgumentParser(add_help=False)
+        for flag, dest in (
+            ("--tensor-parallel-size", "tensor_parallel_size"),
+            ("--expert-parallel-size", "expert_parallel_size"),
+        ):
+            musa_parser.add_argument(flag, type=int, default=None, dest=dest)
+        musa_parser.add_argument("--enforce-eager", action="store_true",
+                                 default=False, dest="enforce_eager")
+        # Flags owned by the base parser are ignored here.
+        cli, _unknown = musa_parser.parse_known_args()
+
+        tp_size, _tp_source = self._resolve_tensor_parallel_size(
+            cli.tensor_parallel_size
+        )
+        if cli.expert_parallel_size is not None:
+            ep_size = cli.expert_parallel_size
+        else:
+            ep_size = cfg.get("expert_parallel_size", 1)
+        self._enforce_eager = cli.enforce_eager or cfg.get("enforce_eager", False)
+
+        print(f"  tensor_parallel_size = {tp_size}  [{_tp_source}]")
+        if ep_size > 1:
+            print(f"  expert_parallel_size = {ep_size}  [cli/yaml]")
+
+        if tp_size > 1 and not self.SUPPORTS_MULTI_CHIP:
+            print(f"Warning: {self.__class__.__name__} does not support multi-chip. "
+                  f"Ignoring tensor_parallel_size={tp_size}, using 1.")
+            tp_size = 1
+            ep_size = 1
+
+        self._parallelism = dict(
+            tensor_parallel_size=tp_size,
+            pipeline_parallel_size=1,
+            expert_parallel_size=ep_size,
+            data_parallel_size=1,
+        )
+        self._chip_count = tp_size
+        self._precision = getattr(args, "precision", None)
+        return args
+
+    def get_extra_subprocess_args(self, args) -> list[str]:
+        """Return the runner-specific CLI flags to forward to a subprocess.
+
+        Always forwards the tensor-parallel size; forwards the expert-parallel
+        size only when it is above 1 and ``--enforce-eager`` only when set.
+        """
+        tp_size = self._parallelism.get("tensor_parallel_size", 1)
+        forwarded = ["--tensor-parallel-size", str(tp_size)]
+
+        if self._parallelism.get("expert_parallel_size", 1) > 1:
+            forwarded.extend(
+                ["--expert-parallel-size",
+                 str(self._parallelism["expert_parallel_size"])]
+            )
+        if self._enforce_eager:
+            forwarded.append("--enforce-eager")
+        return forwarded
+
+
+if __name__ == "__main__":
+    # Script entry point: hand control to the runner's main().
+    MoorethreadsVLLMMUSARunner().main()
diff --git a/runners/moorethreads_vllm_musa_f2f6f965/test_smoke.py b/runners/moorethreads_vllm_musa_f2f6f965/test_smoke.py
new file mode 100644
index 0000000..86cbbf9
--- /dev/null
+++ b/runners/moorethreads_vllm_musa_f2f6f965/test_smoke.py
@@ -0,0 +1,77 @@
+#!/usr/bin/env python3
+"""
+Standalone vllm-musa smoke test (does not use the AccelMark runner).
+
+Usage (from repo root):
+
+    python runners/moorethreads_vllm_musa_f2f6f965/test_smoke.py
+    python runners/moorethreads_vllm_musa_f2f6f965/test_smoke.py /path/to/model
+
+    MODEL_PATH=/path/to/Qwen2.5-0.5B-Instruct \\
+    python runners/moorethreads_vllm_musa_f2f6f965/test_smoke.py
+"""
+
+from __future__ import annotations
+
+import gc
+import os
+import sys
+import time
+
+import torch  # noqa: F401 — before transformers/vllm (libstdc++ load order)
+
+from vllm import LLM, SamplingParams
+
+# Model used when no path is given on the command line; the MODEL_PATH
+# environment variable overrides the built-in default.
+_DEFAULT_MODEL = os.getenv("MODEL_PATH", "Qwen/Qwen2.5-0.5B-Instruct")
+
+# Prompts generated in a single batch by main().
+PROMPTS = [
+    "The capital of France is",
+    "Say hello in one short sentence.",
+]
+
+
+def main() -> int:
+    """Load a model on the MUSA device, generate for PROMPTS and print results.
+
+    The model path is the first command-line argument when present, otherwise
+    ``_DEFAULT_MODEL``. Returns 0 once generation and cleanup have finished.
+    """
+    cli_model = sys.argv[1:2]
+    model_path = cli_model[0] if cli_model else _DEFAULT_MODEL
+
+    sampling_params = SamplingParams(temperature=0.0, max_tokens=64)
+
+    # Small limits keep the smoke test quick and light on memory.
+    engine_options = {
+        "model": model_path,
+        "device": "musa",
+        "dtype": "float16",
+        "tensor_parallel_size": 1,
+        "max_model_len": 1024,
+        "max_num_seqs": 4,
+        "gpu_memory_utilization": 0.85,
+        "trust_remote_code": False,
+    }
+
+    print(f"Loading {model_path} ...")
+    load_started = time.perf_counter()
+    llm = LLM(**engine_options)
+    print(f"Model loaded in {time.perf_counter() - load_started:.1f}s\n")
+
+    infer_started = time.perf_counter()
+    outputs = llm.generate(PROMPTS, sampling_params)
+    print(f"Inference done in {time.perf_counter() - infer_started:.1f}s\n")
+
+    for prompt, output in zip(PROMPTS, outputs):
+        completion = output.outputs[0]
+        text = completion.text
+        n_tokens = len(completion.token_ids)
+        print(f"Prompt:  {prompt!r}")
+        print(f"Output:  {text!r}")
+        print(f"Tokens:  {n_tokens}\n")
+
+    # Best-effort cleanup: drop the engine, then empty the allocator cache.
+    del llm
+    gc.collect()
+    try:
+        cache_backend = torch.musa if hasattr(torch, "musa") else torch.cuda
+        cache_backend.empty_cache()
+    except Exception:
+        pass
+    print("Done.")
+    return 0
+
+
+if __name__ == "__main__":
+    # Use main()'s return value as the process exit status.
+    raise SystemExit(main())
diff --git a/runners/platforms/moorethreads.py b/runners/platforms/moorethreads.py
index 708db1b..9f55684 100644
--- a/runners/platforms/moorethreads.py
+++ b/runners/platforms/moorethreads.py
@@ -1,17 +1,13 @@
 """Moore Threads MUSA GPU platform plug-in.
 
-Moore Threads ships its own driver and management tooling:
-
-* ``mthreads-gmi`` — the moral equivalent of ``nvidia-smi`` / ``rocm-smi``.
-* ``pymtml`` — Python bindings analogous to NVML / pynvml.
-* ``torchada`` — a CUDA→MUSA compatibility shim that exposes the standard
-  ``torch.cuda`` API, with the real backend version available via
-  ``torch.version.musa``.
-
-This plug-in first tries the Python bindings (best machine-readable
-output) and falls back to scraping ``mthreads-gmi`` text output. Both
-paths are best-effort: when none of the tools are installed the plug-in
-silently reports zero accelerators and the collector moves on.
+Used by ``runners/collect_env.py`` to populate ``env_info.json``.
+
+Detection order (first non-empty wins):
+
+  1. ``pymtml`` (mthreads-ml-py) — same API as used in the vllm-musa runner
+  2. ``mthreads-gmi`` text output
+  3. ``torch`` device properties (``torch.cuda`` aliased to MUSA via torchada,
+     or native ``torch.musa`` when available)
 """
 from __future__ import annotations
 
@@ -23,8 +19,6 @@
 VENDOR_LABEL = "Moore Threads"
 PRIORITY = 60
 
-# S5000 / S4000 datacenter SKUs ship with native BF16 support; the older
-# consumer-class MTT S80/S70 cards are FP16-only.
 _BF16_SUPPORTED_HINTS = ("s5000", "s4000", "s3000")
 _NO_BF16_HINTS = ("s80", "s70", "s60", "s50")
 
@@ -40,50 +34,68 @@ def _supports_bf16(chip_name: str) -> bool:
     return True
 
 
+def _driver_version_from_smi() -> str | None:
+    """Return the driver version printed by ``mthreads-gmi``, if any.
+
+    Returns ``None`` when the tool cannot be run or its output contains no
+    ``Driver Version: <value>`` field (matched case-insensitively).
+    """
+    try:
+        report = subprocess.check_output(
+            ["mthreads-gmi"], text=True, stderr=subprocess.DEVNULL
+        )
+        found = re.search(r"Driver\s+Version\s*:\s*(\S+)", report, re.IGNORECASE)
+    except Exception:
+        return None
+    return found.group(1) if found else None
+
+
 def _collect_via_pymtml() -> list[dict]:
     try:
-        import pymtml as mtml  # type: ignore[import-not-found]
+        import pymtml
     except ImportError:
         return []
 
     try:
-        mtml.mtmlInit()
+        pymtml.mtmlInit()
     except Exception:
         return []
 
+    driver = _driver_version_from_smi() or "unknown"
     accelerators: list[dict] = []
     try:
-        count = mtml.mtmlDeviceGetCount()
+        count = pymtml.mtmlDeviceGetCount()
     except Exception:
         try:
-            mtml.mtmlShutdown()
+            pymtml.mtmlShutdown()
         except Exception:
             pass
         return []
 
     for idx in range(int(count)):
         try:
-            handle = mtml.mtmlDeviceGetHandleByIndex(idx)
-            name = mtml.mtmlDeviceGetName(handle)
-            mem = mtml.mtmlDeviceGetMemoryInfo(handle)
-            total_mb = getattr(mem, "total", None) or mem.get("total", 0)
-            driver = mtml.mtmlSystemGetDriverVersion()
+            dev = pymtml.mtmlDeviceGetByIndex(idx)
+            name = pymtml.mtmlDeviceGetName(dev)
+            mem = pymtml.mtmlDeviceGetMemoryInfo(dev)
+            total_bytes = getattr(mem, "total", None)
+            if total_bytes is None and isinstance(mem, dict):
+                total_bytes = mem.get("total")
         except Exception:
             continue
+        if not isinstance(name, str):
+            name = name.decode("utf-8", "ignore")
+        memory_gb = round(int(total_bytes) / (1024 ** 3), 1) if total_bytes else None
         accelerators.append(
             {
                 "index": idx,
-                "name": name if isinstance(name, str) else name.decode("utf-8", "ignore"),
+                "name": name,
                 "vendor": VENDOR_LABEL,
-                "memory_gb": round(int(total_mb) / 1024, 1) if total_mb else None,
-                "driver_version": driver if isinstance(driver, str) else driver.decode("utf-8", "ignore"),
+                "memory_gb": memory_gb,
+                "driver_version": driver,
                 "firmware_version": None,
-                "supports_bf16": _supports_bf16(str(name)),
+                "supports_bf16": _supports_bf16(name),
             }
         )
 
     try:
-        mtml.mtmlShutdown()
+        pymtml.mtmlShutdown()
     except Exception:
         pass
 
@@ -91,12 +103,7 @@ def _collect_via_pymtml() -> list[dict]:
 
 
 def _collect_via_smi() -> list[dict]:
-    """Fallback parser for ``mthreads-gmi`` text output.
-
-    The output format mirrors nvidia-smi: a header with the driver / MUSA
-    versions followed by per-device blocks listing the product name and
-    memory usage. We only need the device name and total memory.
-    """
+    """Parse ``mthreads-gmi`` text output (mthreads-gmi 1.14+ tabular format)."""
     try:
         out = subprocess.check_output(
             ["mthreads-gmi"], text=True, stderr=subprocess.DEVNULL
@@ -110,21 +117,18 @@ def _collect_via_smi() -> list[dict]:
         driver = m.group(1)
 
     accelerators: list[dict] = []
-    # Per-device rows look like:
-    #   |   0  MTT S4000                  ...     | 0000:65:00.0  Off |   ... |
-    # followed by:
-    #   |   0%   45C    P0    ... /   ... |    234MiB / 49152MiB |    ... |
+    # Example row:
+    #   0    MTT S4000      |00000000:28:00.0    |0%    4MiB(49152MiB)
     for match in re.finditer(
-        r"\|\s*(\d+)\s+(MTT\s+\S+(?:\s+\S+)?)\s*", out
+        r"^(\d+)\s+(MTT\s+\S+)\s+\|",
+        out,
+        re.MULTILINE,
     ):
         idx = int(match.group(1))
         name = match.group(2).strip()
-        # Search downstream of this match for the memory line
-        tail = out[match.end():]
-        mem_match = re.search(r"(\d+)MiB\s*/\s*(\d+)MiB", tail)
-        memory_gb = None
-        if mem_match:
-            memory_gb = round(int(mem_match.group(2)) / 1024, 1)
+        tail = out[match.end(): match.end() + 256]
+        mem_match = re.search(r"\d+MiB\((\d+)MiB\)", tail)
+        memory_gb = round(int(mem_match.group(1)) / 1024, 1) if mem_match else None
         accelerators.append(
             {
                 "index": idx,
@@ -139,23 +143,69 @@ def _collect_via_smi() -> list[dict]:
     return accelerators
 
 
+def _collect_via_torch() -> list[dict]:
+    """Fallback when management libraries are missing but torch MUSA is loaded.
+
+    Uses ``torch.musa`` when torch exposes it, otherwise ``torch.cuda``.
+    ``torch.cuda`` also enumerates devices of other vendors, so on that path
+    a device is reported only when torch identifies itself as a MUSA build
+    (``torch.version.musa`` is set) or the device name looks like a Moore
+    Threads part ("MTT" / "Moore Threads"). Without this check any
+    torch-visible accelerator would be labelled as Moore Threads.
+
+    Returns:
+        One dict per detected device, or an empty list when torch is not
+        installed, no device is visible, or no device qualifies.
+    """
+    try:
+        import torch
+    except ImportError:
+        return []
+
+    native_musa = hasattr(torch, "musa")
+    musa_build = native_musa or bool(
+        getattr(getattr(torch, "version", None), "musa", None)
+    )
+
+    backend = torch.musa if native_musa else torch.cuda
+    try:
+        count = int(backend.device_count())
+        get_props = backend.get_device_properties
+    except Exception:
+        return []
+    if count <= 0:
+        return []
+
+    # Queried only once devices exist, to avoid a needless subprocess call.
+    driver = _driver_version_from_smi() or "unknown"
+    accelerators: list[dict] = []
+
+    for idx in range(count):
+        try:
+            props = get_props(idx)
+            raw_name = getattr(props, "name", None)
+            total = getattr(props, "total_memory", None)
+            memory_gb = round(total / (1024 ** 3), 1) if total else None
+        except Exception:
+            continue
+
+        reported = "" if raw_name is None else str(raw_name)
+        if not musa_build:
+            lowered = reported.lower()
+            if "mtt" not in lowered and "moore threads" not in lowered:
+                # torch.cuda device with no sign of being a MUSA GPU.
+                continue
+
+        name = reported or f"MTT GPU {idx}"
+        accelerators.append(
+            {
+                "index": idx,
+                "name": name,
+                "vendor": VENDOR_LABEL,
+                "memory_gb": memory_gb,
+                "driver_version": driver,
+                "firmware_version": None,
+                "supports_bf16": _supports_bf16(name),
+            }
+        )
+    return accelerators
+
+
 def collect() -> list[dict]:
-    accelerators = _collect_via_pymtml()
-    if accelerators:
-        return accelerators
-    return _collect_via_smi()
+    for fn in (_collect_via_pymtml, _collect_via_smi, _collect_via_torch):
+        accelerators = fn()
+        if accelerators:
+            return accelerators
+    return []
 
 
 def detect_runtime_version() -> str | None:
-    """Prefer torch.version.musa (most reliable when torchada is installed),
-    fall back to scraping ``mthreads-gmi`` header.
-    """
     try:
         import torch
 
         ver = getattr(torch.version, "musa", None)
         if ver:
             return f"MUSA {ver}"
+        if getattr(torch.version, "cuda", None):
+            return f"MUSA (torch.cuda shim) {torch.version.cuda}"
     except ImportError:
         pass
 
@@ -174,17 +224,43 @@ def detect_runtime_version() -> str | None:
     return None
 
 
+def detect_pcie_gen() -> str | None:
+    """Return a ``"PCIe <a>x/<b>x"`` label parsed from ``mthreads-gmi`` output.
+
+    Looks for the first ``| <a>x(<b>x) |`` cell in the tool's output.
+    NOTE(review): the two numbers look like link widths rather than a PCIe
+    generation — confirm against the mthreads-gmi column headers.
+
+    Returns ``None`` when the tool cannot be run or no such cell is found.
+    """
+    try:
+        report = subprocess.check_output(
+            ["mthreads-gmi"], text=True, stderr=subprocess.DEVNULL
+        )
+        cell = re.search(r"\|\s*(\d+)x\((\d+)x\)\s*\|", report)
+    except Exception:
+        return None
+    if cell is None:
+        return None
+    first, second = cell.groups()
+    return f"PCIe {first}x/{second}x"
+
+
+def detect_intra_node_interconnect() -> str | None:
+    """Return ``"MCCL/PCIe"`` when more than one accelerator is detected.
+
+    Moore Threads multi-GPU hosts typically use MCCL over PCIe; with zero or
+    one detected device ``None`` is returned.
+    """
+    return "MCCL/PCIe" if len(collect()) > 1 else None
+
+
 def diagnostics(env: dict, accelerators: list[dict]) -> list[str]:
     notes: list[str] = []
-    if accelerators and (env.get("pytorch_version") or "") == "unknown":
+    if not accelerators:
+        notes.append(
+            "No Moore Threads MUSA GPUs detected (tried pymtml, mthreads-gmi, "
+            "and torch). Install the MUSA driver/toolkit per "
+            "https://github.com/MooreThreads/vllm-musa ."
+        )
+        return notes
+    if (env.get("pytorch_version") or "") == "unknown":
         notes.append(
-            "PyTorch (with the torchada MUSA shim) is not installed — "
-            "pytorch_version is unknown."
+            "PyTorch with MUSA support is not installed — pytorch_version is unknown."
         )
-    if accelerators and (env.get("runtime_version") or "") == "unknown":
+    if (env.get("runtime_version") or "") == "unknown":
         notes.append(
             "Could not detect MUSA runtime (tried torch.version.musa and "
-            "mthreads-gmi). runtime_version is unknown — install torchada "
-            "or the Moore Threads MUSA toolkit."
+            "mthreads-gmi). runtime_version is unknown."
         )
     return notes
diff --git a/schema/env.schema.json b/schema/env.schema.json
index 60fc5e8..e80cd94 100644
--- a/schema/env.schema.json
+++ b/schema/env.schema.json
@@ -16,7 +16,7 @@
         "properties": {
           "index": { "type": "integer" },
           "name": { "type": "string" },
-          "vendor": { "type": "string", "description": "Chip vendor, e.g. 'NVIDIA', 'AMD', 'Huawei', 'Apple'" },
+          "vendor": { "type": "string", "description": "Chip vendor, e.g. 'NVIDIA', 'AMD', 'Huawei', 'Apple', 'Moore Threads'" },
           "memory_gb": { "type": ["number","null"], "minimum": 0 },
           "driver_version": { "type": "string" },
           "firmware_version": { "type": ["string","null"] },
diff --git a/schema/result.schema.json b/schema/result.schema.json
index 99a0517..fb81a8a 100644
--- a/schema/result.schema.json
+++ b/schema/result.schema.json
@@ -36,7 +36,7 @@
         "vendor": {
           "type": "string",
           "enum": ["NVIDIA","AMD","Intel","Google","Huawei","Cambricon","Biren",
-                   "Enflame","MetaX","Iluvatar","Apple","Qualcomm","Other"]
+                   "Enflame","MetaX","Moore Threads","Iluvatar","Apple","Qualcomm","Other"]
         },
         "count": { "type": "integer", "minimum": 1 },
         "memory_gb": { "type": "number", "minimum": 0 },