From 99e4a06be3d13bd13bd340c2e43b24fcd7169406 Mon Sep 17 00:00:00 2001
From: Liang Juhao <juhaoliang1997@gmail.com>
Date: Fri, 15 May 2026 11:08:00 +0800
Subject: [PATCH 1/5] =?UTF-8?q?feat:=20add=20Moore=20Threads=20MUSA=20runn?=
 =?UTF-8?q?er=20(S5000/S4000)=20=E2=80=94=20moorethreads=5Fvllm=5Fmusa=5F5?=
 =?UTF-8?q?7ff5443?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds the AccelMark runner skeleton for Moore Threads MTT S5000 / S4000
GPUs via the official vllm-musa platform plugin. The plugin auto-patches
vLLM at import time (torchada CUDA→MUSA aliasing + pymtml + Triton
patches), so the standard vLLM Python API is preserved and the runner
mirrors the structure of ascend_vllm_ascend.

What is included:

* runners/moorethreads_vllm_musa_57ff5443/ — runner.py, meta.json
  (with suite_support self-declaration), requirements.txt, README.md
* configs/runner_configs/runner_moorethreads_vllm_musa_57ff5443.yaml.example

The README platforms matrix updates automatically from the runner's
meta.json (no hand-editing required, thanks to the onboarding
decoupling that landed in the preceding commit). The Moore Threads
environment detector also already lives at runners/platforms/moorethreads.py
in the same earlier commit.

Notes:

* Capability flags are conservative: SUPPORTED_QUANTIZATION_BACKENDS only
  declares compressed-tensors; FP8 / AWQ / GPTQ-Marlin will be enabled in
  a follow-up runner version once real-hardware smoke tests confirm kernel
  coverage on MUSA.
* This code has not yet been validated on physical S5000 / S4000 silicon;
  all suites are marked "pending" in suite_support and smoke testing will
  land as a new runner folder with a fresh hash.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 README.md                                     |   1 +
 ...orethreads_vllm_musa_57ff5443.yaml.example |  62 ++
 .../moorethreads_vllm_musa_57ff5443/README.md | 200 ++++++
 .../moorethreads_vllm_musa_57ff5443/meta.json |  21 +
 .../requirements.txt                          |  58 ++
 .../moorethreads_vllm_musa_57ff5443/runner.py | 575 ++++++++++++++++++
 6 files changed, 917 insertions(+)
 create mode 100644 configs/runner_configs/runner_moorethreads_vllm_musa_57ff5443.yaml.example
 create mode 100644 runners/moorethreads_vllm_musa_57ff5443/README.md
 create mode 100644 runners/moorethreads_vllm_musa_57ff5443/meta.json
 create mode 100644 runners/moorethreads_vllm_musa_57ff5443/requirements.txt
 create mode 100644 runners/moorethreads_vllm_musa_57ff5443/runner.py

diff --git a/README.md b/README.md
index ea9e2b6..92cec27 100644
--- a/README.md
+++ b/README.md
@@ -93,6 +93,7 @@ Reference runners live under `runners/` (see each folder’s `meta.json`). The t
 | Huawei Ascend NPU | `ascend_vllm_ascend_d4aa9fda` | vllm-ascend | ✓ | ✓ | ✓ | ✓ | ✓ | — | — |
 | Apple Silicon | `apple_mlx_lm_9546b8b5` | mlx-lm | ⋯ | — | — | ⋯ | — | ⋯ | — |
 | Google TPU | `google_vllm_tpu_68cc9ffa` | vllm-tpu | ✓ | — | — | ✓ | — | ✓ | — |
+| Moore Threads GPU | `moorethreads_vllm_musa_57ff5443` | vllm-musa | ⋯ | ⋯ | ⋯ | ⋯ | ⋯ | ⋯ | — |
 
 _Legend: ✓ validated · ⋯ author-declared (not smoke-tested in this repo yet) · — unsupported._
 <!-- platforms-matrix:end -->
diff --git a/configs/runner_configs/runner_moorethreads_vllm_musa_57ff5443.yaml.example b/configs/runner_configs/runner_moorethreads_vllm_musa_57ff5443.yaml.example
new file mode 100644
index 0000000..5c8f878
--- /dev/null
+++ b/configs/runner_configs/runner_moorethreads_vllm_musa_57ff5443.yaml.example
@@ -0,0 +1,62 @@
+# AccelMark runner config — moorethreads_vllm_musa_57ff5443 (vllm-musa on Moore Threads)
+#
+# Copy this file to runner_moorethreads_vllm_musa_57ff5443.yaml (remove
+# .example suffix) and edit as needed for your hardware. The actual .yaml
+# is gitignored.
+#
+# These settings adapt the runner to your hardware environment. They are
+# recorded in result.json task.extra_config for transparency but are NOT
+# part of the benchmark identity (not hashed into run_id).
+#
+# Merge priority: CLI flags > suite-specific > global defaults > runner defaults
+
+# ── Global defaults (apply to all suites) ─────────────────────────────────────
+
+# Tensor parallel size — number of Moore Threads GPUs to use (default: 1).
+# For multi-card runs make sure to export VLLM_WORKER_MULTIPROC_METHOD=spawn.
+tensor_parallel_size: 1
+
+# Set to true to disable CUDA-graph capture / compilation, e.g. when you hit
+# Triton kernel errors on first request (most common on S3000 / S80 paths).
+enforce_eager: false
+
+# Maximum number of sequences in a batch (default: 256).
+# Reduce on lower-memory cards: 128 on 24 GB cards, 64 on 16 GB cards.
+max_num_seqs: 256
+
+# Fraction of MUSA device memory vLLM may use in total — weights, activations
+# and KV cache (default: 0.85). Reduce if you hit OOM; the vLLM flag is named
+# gpu_memory_utilization but applies to MUSA memory via torchada.
+gpu_memory_utilization: 0.85
+
+# Pass-through kwargs forwarded directly to vLLM LLM() / AsyncEngineArgs().
+# Unknown keys are dropped automatically with a warning, so this is safe to
+# use across vLLM 0.10.x / 0.13.x.
+# engine_kwargs:
+#   swap_space: 8
+#   max_seq_len_to_capture: 4096
+
+# ── Suite-specific overrides ───────────────────────────────────────────────────
+
+suites:
+  suite_D:
+    # Long-context — reduce batch size and reserve more memory.
+    max_num_seqs: 32
+    gpu_memory_utilization: 0.80
+
+  suite_F:
+    # Consumer / edge GPU — enforce_eager often safer for first runs.
+    # enforce_eager: true
+    max_num_seqs: 128
+
+# ── Speculative decoding (suite_A / suite_D extra scenario) ─────────────────
+# Uncomment to enable — merge into the existing `suites:` mapping above rather
+# than adding a second `suites:` key. vllm-musa accepts upstream vLLM's
+# speculative_config dict; the runner translates flat keys (speculative_model,
+# num_speculative_tokens, ...) into speculative_config automatically.
+# suites:
+#   suite_A:
+#     engine_kwargs:
+#       speculative_model: "meta-llama/Llama-3.2-1B-Instruct"
+#       num_speculative_tokens: 4
+#       speculative_draft_tensor_parallel_size: 1
diff --git a/runners/moorethreads_vllm_musa_57ff5443/README.md b/runners/moorethreads_vllm_musa_57ff5443/README.md
new file mode 100644
index 0000000..82411a1
--- /dev/null
+++ b/runners/moorethreads_vllm_musa_57ff5443/README.md
@@ -0,0 +1,200 @@
+# moorethreads_vllm_musa_57ff5443 — Moore Threads MUSA Runner (vllm-musa)
+
+AccelMark runner for Moore Threads MUSA GPUs using
+[vllm-musa](https://github.com/MooreThreads/vllm-musa), the official vLLM
+platform plugin for MUSA hardware.
+
+> **Status:** This runner is **untested on real silicon at the time of
+> commit**. The code is written against the public `vllm-musa` plugin
+> documentation and follows the structural template of the
+> `ascend_vllm_ascend_*` runner. Plan to smoke-test on an S5000 / S4000
+> system; capability flags and dtype mappings may be adjusted in a follow-up
+> runner version (new hash, new folder) based on real-world findings.
+
+## How vllm-musa works
+
+`vllm-musa` is a vLLM **platform plugin** (auto-detected on `import vllm`)
+that makes the standard vLLM Python API run on Moore Threads MUSA GPUs. It
+relies on three components:
+
+| Component | Role |
+|---|---|
+| `torchada` | CUDA→MUSA compatibility layer for PyTorch — aliases `torch.cuda.*` to MUSA so most code paths run unmodified |
+| `pymtml` (`mthreads-ml-py`) | Moore Threads Management Library bindings, equivalent to `nvidia-ml-py` |
+| Triton patches | Runtime monkey-patches in `vllm_musa_platform.patches.*` that fix `triton.attention` and `worker` modules for MUSA's Triton compiler |
+
+The standard `vllm.LLM`, `vllm.AsyncLLMEngine`, and `vllm.SamplingParams`
+remain the entry points — this runner therefore reuses ~95% of the logic
+from the NVIDIA / Ascend vLLM runners.
+
+## Supported suites
+
+| Suite | Description | Notes |
+|-------|-------------|-------|
+| Suite A | Single-chip, Llama-3-8B | Pending smoke test on S4000 / S5000 |
+| Suite B | Multi-chip, Llama-3-70B | Requires multiple Moore Threads cards + MCCL TP |
+| Suite C | Quantization, Llama-3.1-8B | FP8 skipped (no native FP8 in current MUSA hardware); compressed-tensors W8A8/W8A16 candidate; AWQ / GPTQ pending validation |
+| Suite D | Long context ~28K input, Llama-3.1-8B | Reduce `max_num_seqs` and `gpu_memory_utilization` |
+| Suite E | Multi-chip scaling, Llama-3-8B | Validates MCCL tensor parallelism |
+| Suite F | Consumer/edge, Qwen2.5-0.5B | Recommended starting point for S4000 single-card systems |
+
+## Hardware compatibility
+
+| GPU | BF16 | TP via MCCL | FP8 | Notes |
+|-----|------|-------------|-----|-------|
+| MTT S5000 | ✅ | ✅ | ❌ | Recommended public reference target (FA3 via MATE) |
+| MTT S4000 | ✅ | ✅ | ❌ | Validated path with PyTorch SDPA-based FlashAttention |
+| MTT S3000 | ⚠️ | ⚠️ | ❌ | May work via `--enforce-eager`; not the public reference |
+| MTT S80 | ⚠️ | — | ❌ | Consumer card; treat as best-effort |
+
+## Prerequisites
+
+You must install the MUSA stack in this exact order — Python packages alone
+are not sufficient:
+
+**1. MUSA toolkit + driver**
+
+Match the toolkit version to your card firmware. Reference:
+<https://developer.mthreads.com/musa/>
+
+**2. PyTorch with MUSA support (torch + torchada)**
+
+The recommended path is the official Moore Threads container, which ships a
+pre-built `torch==2.7.1` together with `torchada` and `pymtml`. See:
+
+```bash
+docker pull sh-harbor.mthreads.com/mcctest/musa-compile:rc4.3.3-torch2.7-20251120
+```
+
+**3. Runner dependencies**
+
+Inside the MUSA container:
+
+```bash
+pip install -r runners/moorethreads_vllm_musa_57ff5443/requirements.txt
+```
+
+This installs `vllm-musa==0.1.1` which auto-pulls a validated vLLM core
+(`0.10.1.1` by default). To use vLLM `0.13.0` instead (V1-only engine):
+
+```bash
+pip install vllm==0.13.0 --no-deps --upgrade
+pip install 'depyf==0.20.0' 'llguidance>=1.3.0,<1.4.0' \
+            'lm-format-enforcer==0.11.3' 'outlines_core==0.2.11' \
+            'xgrammar==0.1.27' 'compressed-tensors==0.12.2'
+```
+
+## Required environment variables
+
+```bash
+# Device visibility (works like CUDA_VISIBLE_DEVICES)
+export MUSA_VISIBLE_DEVICES=0,1,2,3
+
+# Recommended for multi-process workers (TP > 1)
+export VLLM_WORKER_MULTIPROC_METHOD=spawn
+```
+
+## Basic usage
+
+```bash
+# Verify the plugin is loaded before running anything else
+python -c "from vllm_musa_platform import musa_platform_plugin; print('ok')"
+
+# Suite F (single-card S4000 / S5000)
+python run.py --runner moorethreads_vllm_musa_57ff5443 --suite suite_F
+
+# Suite A (single-card datacenter benchmark)
+python run.py --runner moorethreads_vllm_musa_57ff5443 --suite suite_A
+
+# Multi-card tensor parallelism (e.g. 8 x S5000 on a single host)
+VLLM_WORKER_MULTIPROC_METHOD=spawn \
+python run.py --runner moorethreads_vllm_musa_57ff5443 \
+    --suite suite_B \
+    --tensor-parallel-size 8
+
+# Local model cache
+python run.py --runner moorethreads_vllm_musa_57ff5443 \
+    --suite suite_A \
+    --model-path /data/models/Meta-Llama-3-8B-Instruct
+```
+
+## Runner config
+
+Copy the example config and adjust for your hardware:
+
+```bash
+cp configs/runner_configs/runner_moorethreads_vllm_musa_57ff5443.yaml.example \
+   configs/runner_configs/runner_moorethreads_vllm_musa_57ff5443.yaml
+```
+
+Key settings:
+
+| Field | Default | Notes |
+|-------|---------|-------|
+| `tensor_parallel_size` | 1 | Number of MUSA GPUs for tensor parallelism |
+| `enforce_eager` | false | Disable CUDA-graph / compilation; useful for pre-S4000 cards or while debugging |
+| `max_num_seqs` | 256 | Max concurrent sequences; reduce on lower-memory cards |
+| `gpu_memory_utilization` | 0.85 | Fraction of device memory vLLM may use (weights + activations + KV cache); reduce if OOM |
+
+## Triton / kernel compilation errors
+
+If you encounter errors during Triton graph capture on first request,
+disable graph capture with `--enforce-eager`:
+
+```bash
+python run.py --runner moorethreads_vllm_musa_57ff5443 \
+    --suite suite_F --enforce-eager
+```
+
+Or set persistently in the runner config YAML:
+
+```yaml
+enforce_eager: true
+```
+
+## HBM OOM errors
+
+Reduce `gpu_memory_utilization` and/or `max_num_seqs`, either globally or
+per-suite (Suite D is the most memory-hungry due to long-context inputs):
+
+```yaml
+gpu_memory_utilization: 0.80
+max_num_seqs: 128
+
+suites:
+  suite_D:
+    max_num_seqs: 32
+    gpu_memory_utilization: 0.78
+```
+
+## Known gaps (pre-smoke-test)
+
+The following items are placeholders and **must be re-validated** on real
+S4000 / S5000 hardware:
+
+- **Memory peak**: relies on `torch.cuda.max_memory_allocated()` which
+  torchada aliases to MUSA. If this returns 0 or `None`, fall back to
+  `pymtml.mtmlDeviceGetMemoryInfo()`.
+- **MCCL teardown**: assumes the same `cleanup_dist_env_and_memory` entry
+  point as upstream vLLM. If MCCL leaves a hanging process group, the
+  fallback path explicitly destroys the torch.distributed group.
+- **Quantization**: `SUPPORTED_QUANTIZATION_BACKENDS` currently lists only
+  `compressed-tensors`. AWQ / GPTQ-Marlin / FP8 are intentionally excluded
+  until kernel coverage on MUSA is confirmed.
+- **Device-count detection**: `_get_chip_count()` prefers `pymtml` over
+  `torch.cuda.device_count()`. On hosts where pymtml is missing this may
+  miscount; in that case the torch fallback should still work because
+  torchada provides `torch.cuda.device_count()`.
+
+## Requirements
+
+See `requirements.txt` for the pinned plugin / extras list. The heavy
+dependencies (torch + torchada + MUSA toolkit) must come from the Moore
+Threads container; do not install them from PyPI.
+
+Minimum environment:
+- Moore Threads MTT S4000 or newer (S3000 / S80 best-effort)
+- MUSA toolkit + driver matching card firmware
+- torch 2.7.1 (Moore Threads MUSA build) + torchada ≥ 0.1.9
+- Python 3.10+
+- vllm-musa 0.1.1 (vLLM core 0.10.1.1 or 0.13.0)
diff --git a/runners/moorethreads_vllm_musa_57ff5443/meta.json b/runners/moorethreads_vllm_musa_57ff5443/meta.json
new file mode 100644
index 0000000..655a6ef
--- /dev/null
+++ b/runners/moorethreads_vllm_musa_57ff5443/meta.json
@@ -0,0 +1,21 @@
+{
+  "id": "moorethreads_vllm_musa_57ff5443",
+  "platform": "moorethreads",
+  "name": "vllm-musa on Moore Threads MUSA GPU",
+  "framework": "vllm-musa",
+  "submitted_by": "JuhaoLiang1997",
+  "description": "AccelMark runner for Moore Threads MTT S4000 / S5000 MUSA GPUs via the vllm-musa platform plugin (vLLM 0.10.x / 0.13.x + torchada CUDA→MUSA compatibility + pymtml). API-compatible with standard vLLM; MCCL-based tensor parallelism. FP8 excluded — not supported on current MUSA hardware. Quantization limited to compressed-tensors (W8A8/W8A16) pending real-hardware validation of AWQ / GPTQ / FP8 paths.",
+  "supersedes_chain": [],
+  "notes": "Initial Moore Threads runner. Written from the public vllm-musa documentation and the structural template of ascend_vllm_ascend_d4aa9fda; capability flags, dtype mapping and teardown sequence are placeholders awaiting smoke-testing on real S4000 / S5000 silicon.",
+  "created": "2026-05-15",
+  "hardware_label": null,
+  "suite_support": {
+    "A": "pending",
+    "B": "pending",
+    "C": "pending",
+    "D": "pending",
+    "E": "pending",
+    "F": "pending",
+    "G": "unsupported"
+  }
+}
diff --git a/runners/moorethreads_vllm_musa_57ff5443/requirements.txt b/runners/moorethreads_vllm_musa_57ff5443/requirements.txt
new file mode 100644
index 0000000..2a44733
--- /dev/null
+++ b/runners/moorethreads_vllm_musa_57ff5443/requirements.txt
@@ -0,0 +1,58 @@
+# AccelMark -- Moore Threads MUSA vllm-musa runner dependencies
+#
+# This runner is designed to run inside the official Moore Threads MUSA
+# container (which already ships torch + torchada built for the MUSA
+# toolkit) and only installs the vLLM platform plugin + accelmark extras
+# on top of it.
+#
+# Tested image (subject to change at smoke-test time):
+#   sh-harbor.mthreads.com/mcctest/musa-compile:rc4.3.3-torch2.7-20251120
+# Reference docker command:
+#   docker run -d --net host --privileged --pid=host --shm-size 500g \
+#     -v $PWD:/ws -w /ws \
+#     --name accelmark-musa \
+#     sh-harbor.mthreads.com/mcctest/musa-compile:rc4.3.3-torch2.7-20251120 \
+#     sleep infinity
+#   docker exec -it accelmark-musa bash
+#
+# Pre-installed in the container (do NOT reinstall via pip):
+#   torch==2.7.1              (built for MUSA with torchada)
+#   torchada>=0.1.9           (CUDA→MUSA compatibility layer)
+#   mthreads-ml-py>=2.2.5     (pymtml — MTML bindings)
+#
+# vLLM core: the plugin pulls in a compatible version automatically, but for
+# reproducibility we pin to one of the validated combinations below.
+# Pick ONE of these two stacks (the matching commands are in the install
+# guide in README.md):
+#
+#   stack A — vLLM 0.10.1.1 (V0 + V1 engines):
+#     pip install -e .   # plugin auto-installs vllm==0.10.1.1
+#
+#   stack B — vLLM 0.13.0 (V1-only):
+#     pip install -e .                      # plugin installs vllm==0.10.1.1
+#     pip install vllm==0.13.0 --no-deps --upgrade
+#     pip install 'depyf==0.20.0' 'llguidance>=1.3.0,<1.4.0' \
+#                 'lm-format-enforcer==0.11.3' 'outlines_core==0.2.11' \
+#                 'xgrammar==0.1.27' 'compressed-tensors==0.12.2'
+
+# vLLM MUSA platform plugin (PyPI: vllm-musa, GitHub: MooreThreads/vllm-musa)
+vllm-musa==0.1.1
+
+# Transformers stack — NOTE(review): verify pins against vLLM 0.10.x / 0.13.x minimum versions
+transformers==4.46.3
+tokenizers==0.20.3
+huggingface-hub==0.26.5
+accelerate==1.2.1
+safetensors==0.4.5
+
+# AccelMark dependencies (not bundled in the image)
+numpy==1.26.4
+jsonschema==4.25.1
+psutil==7.1.0
+tqdm==4.67.1
+
+# Async support
+aiohttp==3.12.15
+
+# Config file parsing
+PyYAML==6.0.2
diff --git a/runners/moorethreads_vllm_musa_57ff5443/runner.py b/runners/moorethreads_vllm_musa_57ff5443/runner.py
new file mode 100644
index 0000000..d753330
--- /dev/null
+++ b/runners/moorethreads_vllm_musa_57ff5443/runner.py
@@ -0,0 +1,575 @@
+"""
+AccelMark — Moore Threads MUSA GPU benchmark runner (vllm-musa).
+
+Implements BenchmarkRunner for vLLM on Moore Threads MUSA GPUs via the
+``vllm-musa`` platform plugin. All orchestration logic lives in
+``runners/benchmark_runner.py``.
+
+The plugin works by patching vLLM at import time:
+  - ``torchada`` aliases the CUDA Python API onto MUSA
+  - ``pymtml`` (mthreads-ml-py) provides device queries equivalent to
+    nvidia-ml-py
+  - A few Triton attention/worker patches are applied to make the standard
+    vLLM kernels run on MUSA's Triton compiler.
+
+As a result, the standard vLLM Python API (``LLM``, ``AsyncLLMEngine``,
+``SamplingParams``) is fully preserved. This runner is therefore structurally
+identical to the NVIDIA / AMD / Ascend vLLM runners — the differences are
+in capability flags, device-count detection, and memory teardown.
+
+Hardware:     Moore Threads MTT S4000 / S5000 (and forward-compatible
+              successors). S3000 / S80 may also work but are not the public
+              reference target.
+Runtime:      MUSA (Meta-computing Unified System Architecture)
+Framework:    vllm-musa — https://github.com/MooreThreads/vllm-musa
+              (also published on PyPI as ``vllm-musa``)
+Precision:    BF16 (preferred on S4000+), FP16 fallback. FP8 not yet
+              supported on shipping MUSA hardware.
+Quantization: compressed-tensors (W8A8 / W8A16) declared by default. AWQ /
+              GPTQ / FP8 may be added once validated on real hardware.
+Multi-chip:   Tensor parallelism via MCCL (Moore Threads Collective
+              Communications Library). vLLM's tensor_parallel_size flag works
+              unchanged because torchada aliases the NCCL API surface.
+Streaming:    Fully supported — AsyncLLMEngine API is identical to vLLM.
+
+Installation (informational only — not yet exercised on a real device; final
+versions to be confirmed at smoke-test time):
+
+    # 1. Install the MUSA toolkit + driver matching your card firmware:
+    #    https://developer.mthreads.com/musa/
+    # 2. Install Moore Threads' PyTorch build (torch + torchada) inside the
+    #    official MUSA container, then:
+    pip install -r runners/moorethreads_vllm_musa_{hash8}/requirements.txt
+
+Usage:
+
+    # S5000 single chip
+    python run.py --runner moorethreads_vllm_musa_{hash8} --suite suite_F
+
+    # Multi-chip tensor parallelism (e.g. 8 x S5000)
+    VLLM_WORKER_MULTIPROC_METHOD=spawn \
+    python run.py --runner moorethreads_vllm_musa_{hash8} \
+        --suite suite_B --tensor-parallel-size 8
+
+Environment variables you might want to set:
+    MUSA_VISIBLE_DEVICES        — equivalent to CUDA_VISIBLE_DEVICES
+    VLLM_WORKER_MULTIPROC_METHOD=spawn   — recommended for multi-process workers
+"""
+
+import asyncio
+import gc
+import sys
+import time
+from pathlib import Path
+from typing import Optional
+
+# Add repo root to path
+_REPO_ROOT = Path(__file__).resolve().parent.parent.parent
+sys.path.insert(0, str(_REPO_ROOT))
+
+from runners.benchmark_runner import BenchmarkRunner, InferenceRequest
+from loadgen.types import InferenceResult
+
+# Quieten vLLM's engine loggers: only WARNING and above are emitted.
+import logging
+logging.getLogger("vllm.engine.async_llm_engine").setLevel(logging.WARNING)
+logging.getLogger("vllm.engine.llm_engine").setLevel(logging.WARNING)
+
+
+class MoorethreadsVLLMMUSARunner(BenchmarkRunner):
+    """
+    AccelMark benchmark runner using ``vllm-musa`` on Moore Threads MUSA GPUs.
+
+    ``vllm-musa`` is registered as a vLLM platform plugin and is auto-detected
+    on ``import vllm``. The plugin activates the MUSA backend when:
+      - the plugin package is installed in the environment
+      - Moore Threads devices are visible to the process
+
+    The inference methods below are structurally identical to the NVIDIA vLLM
+    runner — platform-specific logic is isolated to
+    ``_get_chip_count()``, ``load_model()``, ``get_peak_memory_gb()``, and
+    ``release_resources()``.
+    """
+
+    SUPPORTS_STREAMING = True
+    SUPPORTS_BATCHING = True
+    SUPPORTS_ONLINE = True
+    SUPPORTS_MULTI_CHIP = True  # MCCL-based tensor parallelism on multi-card hosts
+
+    # S4000 / S5000 advertise native BF16 for LLM workloads; FP16 always works
+    # as a fallback. FP32 is left in the list for completeness but is rarely
+    # used for inference. FP8 is excluded entirely — current shipping MUSA
+    # hardware does not expose native FP8 datapaths.
+    SUPPORTED_PRECISIONS = ["bf16", "fp16", "fp32"]
+
+    # Quantization backends — start conservative. compressed-tensors is the
+    # safe default on every modern vLLM build because the kernels are pure
+    # Triton + PyTorch matmuls and so are reachable through torchada.
+    # Marlin / AWQ-CUDA / native FP8 require kernel-level validation on MUSA
+    # and should be added in a follow-up runner version after real-hardware
+    # smoke tests, not silently flipped on here.
+    SUPPORTED_QUANTIZATION_BACKENDS = ["compressed-tensors"]
+
+    def __init__(self) -> None:
+        self.llm = None              # vllm.LLM (offline / accuracy); set by load_model() when use_async is false
+        self.engine = None           # vllm.AsyncLLMEngine (online / interactive); set by load_model() when use_async is true
+        self.tokenizer = None        # tokenizer; set by load_model()
+        self.sampling_params = None  # SamplingParams; set by load_model()
+        self._loop: Optional[asyncio.AbstractEventLoop] = None  # event loop created by load_model() for the async engine
+
+    # ── Metadata ─────────────────────────────────────────────────────────────
+
+    def _get_chip_count(self) -> int:
+        """Return the number of Moore Threads MUSA GPUs (never less than 1).
+
+        Sources, tried in order; any exception falls through to the next:
+          1. ``pymtml`` — ``mtmlInit()`` then ``mtmlDeviceGetCount()``, with
+             ``mtmlShutdown()`` always attempted afterwards. Used only when
+             the reported count is positive.
+             NOTE(review): presumably a driver-level count that ignores
+             ``MUSA_VISIBLE_DEVICES`` — confirm on real hardware.
+          2. ``torch.cuda.device_count()`` — expected to be aliased to MUSA
+             by torchada; a count below 1, or any error, yields 1.
+        """
+        try:
+            import pymtml
+            pymtml.mtmlInit()
+            try:
+                mtml_count = pymtml.mtmlDeviceGetCount()
+            finally:
+                try:
+                    pymtml.mtmlShutdown()
+                except Exception:
+                    pass
+            if mtml_count and mtml_count > 0:
+                return int(mtml_count)
+        except Exception:
+            pass
+
+        try:
+            import torch
+            visible = torch.cuda.device_count()
+            return max(visible, 1)
+        except Exception:
+            return 1
+
+    def _get_framework_name(self) -> str:
+        """Return the framework label, kept distinct from plain "vLLM" so that
+        leaderboard grouping does not silently mix MUSA and CUDA results."""
+        return "vllm-musa"
+
+    def _get_framework_version(self) -> str:
+        """Return ``"<plugin>+vllm-<core>"`` for the installed stack.
+
+        The plugin version comes from the ``vllm-musa`` distribution
+        metadata, falling back to ``vllm_musa_platform.__version__``; the
+        core version is ``vllm.__version__``. If only the core version is
+        known it is returned alone; if neither is known, ``"unknown"``.
+        """
+        unknown = "unknown"
+
+        try:
+            from importlib.metadata import version
+            plugin = version("vllm-musa")
+        except Exception:
+            try:
+                import vllm_musa_platform  # type: ignore
+                plugin = getattr(vllm_musa_platform, "__version__", unknown)
+            except Exception:
+                plugin = unknown
+
+        try:
+            import vllm
+            core = vllm.__version__
+        except Exception:
+            core = unknown
+
+        if plugin != unknown:
+            return f"{plugin}+vllm-{core}"
+        # No plugin version: report the core alone ("unknown" if that is missing too).
+        return core
+
+    def get_model_format(self) -> str:
+        return "HuggingFace original"  # fixed label; load_model() passes model_path to vLLM as given
+
+    # ── Model loading ────────────────────────────────────────────────────────
+
+    def load_model(self, model_path: str, parallelism: dict) -> None:
+        """
+        Load ``model_path`` through vllm-musa and prepare sampling parameters.
+
+        Builds ``vllm.LLM`` when ``use_async`` is false, otherwise a
+        ``vllm.AsyncLLMEngine`` on a freshly created event loop. No device
+        flag is passed; the plugin is expected to select the MUSA backend.
+
+        Args:
+            model_path: Model id or local path; also used for the tokenizer.
+            parallelism: Must contain ``tensor_parallel_size``,
+                ``pipeline_parallel_size``, ``max_tokens``, ``max_model_len``
+                and ``use_async``; ``expert_parallel_size`` is optional.
+
+        Raises:
+            AssertionError: If ``pipeline_parallel_size`` is greater than 1.
+        """
+        from transformers import AutoTokenizer
+        from vllm import LLM, AsyncLLMEngine, SamplingParams
+        from vllm.engine.arg_utils import AsyncEngineArgs
+
+        tp_size = parallelism["tensor_parallel_size"]
+        pp_size = parallelism["pipeline_parallel_size"]
+        ep_size = parallelism.get("expert_parallel_size", 1)
+        assert pp_size <= 1, (
+            "Pipeline parallelism (pp_size > 1) is not supported in "
+            "MoorethreadsVLLMMUSARunner. Use --tensor-parallel-size for "
+            "multi-chip runs."
+        )
+
+        max_tokens = parallelism["max_tokens"]
+        max_model_len = parallelism["max_model_len"]
+        use_async = parallelism["use_async"]
+        enforce_eager = getattr(self, "_enforce_eager", False)
+
+        cfg = getattr(self, "_runner_config", {})
+        max_num_seqs = cfg.get("max_num_seqs", 256)
+        # Forwarded as vLLM's gpu_memory_utilization; the vLLM name is kept so
+        # the YAML stays schema-compatible with other runners' configs.
+        musa_memory_util = cfg.get("gpu_memory_utilization", 0.85)
+        extra_kwargs = dict(cfg.get("engine_kwargs") or {})
+
+        # Fold the flat speculative-decoding keys into ``speculative_config``
+        # BEFORE filtering: on vLLM versions whose EngineArgs does not declare
+        # ``speculative_model`` the filter below would drop it as unknown and
+        # speculative decoding would be silently disabled. Skipped when the
+        # user already supplied ``speculative_config``.
+        if "speculative_model" in extra_kwargs and "speculative_config" not in extra_kwargs:
+            extra_kwargs["speculative_config"] = {
+                "model": extra_kwargs.pop("speculative_model"),
+                "num_speculative_tokens": extra_kwargs.pop("num_speculative_tokens", 4),
+                "draft_tensor_parallel_size": extra_kwargs.pop(
+                    "speculative_draft_tensor_parallel_size", 1
+                ),
+            }
+
+        # Keep only engine_kwargs declared by the installed vLLM's EngineArgs
+        # dataclass (unknown kwargs raise TypeError at construction). Best
+        # effort: if introspection fails, the kwargs pass through unfiltered.
+        try:
+            import dataclasses
+            from vllm.engine.arg_utils import EngineArgs as _EngineArgs
+            _valid = {f.name for f in dataclasses.fields(_EngineArgs)}
+            _dropped = [k for k in extra_kwargs if k not in _valid]
+            if _dropped:
+                print(f"  Warning: engine_kwargs keys not supported by this "
+                      f"vllm-musa / vLLM version and will be ignored: "
+                      f"{_dropped}")
+            extra_kwargs = {k: v for k, v in extra_kwargs.items() if k in _valid}
+        except Exception:
+            pass
+
+        effective_precision = getattr(self, "_effective_precision", "BF16").upper()
+        precision = getattr(self, "_precision", None) or effective_precision
+
+        _dtype_override = getattr(self, "_precision_dtype_override", None)
+        _prec_eng_kwargs = dict(getattr(self, "_precision_engine_kwargs", None) or {})
+        quantization = _prec_eng_kwargs.pop("quantization", None)
+
+        _NATIVE_DTYPE_MAP = {"BF16": "bfloat16", "FP16": "float16", "FP32": "float32"}
+        # Case-insensitive lookup; anything outside the map falls back to "auto".
+        dtype = _NATIVE_DTYPE_MAP.get(str(precision).upper(), "auto")
+        self._quantization_method = quantization
+
+        if _dtype_override:
+            dtype = _dtype_override
+        # User engine_kwargs take precedence over precision-profile kwargs.
+        if _prec_eng_kwargs:
+            _prec_eng_kwargs.update(extra_kwargs)
+            extra_kwargs = _prec_eng_kwargs
+
+        quant_note = f", quantization_method={quantization}" if quantization else ""
+        print(f"Loading model: precision={precision}, dtype={dtype}{quant_note}")
+
+        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=False)
+
+        self.sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0)
+
+        base_kwargs = dict(
+            model=model_path,
+            dtype=dtype,
+            tensor_parallel_size=tp_size,
+            trust_remote_code=False,
+            enforce_eager=enforce_eager,
+        )
+        if ep_size > 1:
+            base_kwargs["enable_expert_parallel"] = True
+        if quantization:
+            base_kwargs["quantization"] = quantization
+        if max_model_len:
+            base_kwargs["max_model_len"] = max_model_len
+
+        if not use_async:
+            self.llm = LLM(**{
+                **base_kwargs,
+                "max_num_seqs": max_num_seqs,
+                "gpu_memory_utilization": musa_memory_util,
+                **extra_kwargs,
+            })
+        else:
+            # NOTE(review): max_num_seqs is not forwarded on the async path —
+            # confirm this is intentional.
+            self._loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(self._loop)
+            engine_args = AsyncEngineArgs(**{
+                **base_kwargs,
+                "gpu_memory_utilization": musa_memory_util,
+                **extra_kwargs,
+            })
+            self.engine = AsyncLLMEngine.from_engine_args(engine_args)
+
+    def get_effective_dtype(self) -> Optional[str]:
+        """Report the actual compute dtype vllm-musa resolved after loading.
+
+        Returns:
+            ``model_config.dtype`` of the loaded engine rendered as a string
+            with the ``torch.`` prefix removed. The sync ``self.llm`` is
+            consulted first, then the async ``self.engine``. If neither is
+            set, or the attribute walk raises, the value of
+            ``self._effective_dtype`` is returned (``None`` when that
+            attribute does not exist).
+        """
+        try:
+            if self.llm is not None:
+                return str(self.llm.llm_engine.model_config.dtype).replace("torch.", "")
+            elif self.engine is not None:
+                return str(self.engine.engine.model_config.dtype).replace("torch.", "")
+        except Exception:
+            # Best-effort: any failure while walking the engine attributes
+            # falls through to the attribute-based fallback below.
+            pass
+        return getattr(self, "_effective_dtype", None)
+
+    # ── Inference ────────────────────────────────────────────────────────────
+
+    def inference_fn_offline(
+        self, requests: list[InferenceRequest]
+    ) -> list[InferenceResult]:
+        """
+        Synchronous batch inference via vllm-musa LLM.generate().
+        total_time_ms is wall-clock elapsed time for the full batch.
+
+        Args:
+            requests: Requests whose ``prompt`` is passed through
+                ``self._format_prompt`` before generation.
+
+        Returns:
+            One ``InferenceResult`` per engine output. Every result carries
+            the same ``total_time_ms`` (the whole batch's elapsed time),
+            ``first_token_time_ms=None`` and ``success=True``.
+
+        Side effects:
+            Stores the generated texts in ``self._last_accuracy_outputs``.
+        """
+        formatted = [self._format_prompt(r.prompt) for r in requests]
+        # Only the generate() call is timed; prompt formatting is excluded.
+        t_start = time.perf_counter()
+        outputs = self.llm.generate(formatted, self.sampling_params)
+        elapsed = time.perf_counter() - t_start
+
+        # Keep the raw completions of this batch on the instance.
+        self._last_accuracy_outputs = [o.outputs[0].text for o in outputs]
+
+        results = []
+        for output in outputs:
+            # Only the first candidate (outputs[0]) of each request is reported.
+            results.append(InferenceResult(
+                first_token_time_ms=None,
+                total_time_ms=elapsed * 1000,
+                output_tokens=len(output.outputs[0].token_ids),
+                input_tokens=len(output.prompt_token_ids),
+                success=True,
+                output_text=output.outputs[0].text,
+            ))
+        return results
+
+    async def inference_fn_streaming(
+        self, request: InferenceRequest
+    ) -> InferenceResult:
+        """Async streaming for TTFT — API identical to NVIDIA vLLM runner.
+
+        Streams a single request through ``self.engine.generate`` and records
+        the time until the first chunk that contains at least one generated
+        token, plus the total wall-clock time of the stream.
+
+        Args:
+            request: Request whose ``prompt`` is passed through
+                ``self._format_prompt`` before generation.
+
+        Returns:
+            An ``InferenceResult`` with ``first_token_time_ms`` (``None`` if
+            no token was ever produced), ``total_time_ms``, the token count
+            and text of the last streamed chunk, and ``input_tokens`` taken
+            from the streamed output's ``prompt_token_ids`` (``0`` when the
+            engine does not report them). ``success`` is always ``True``;
+            engine exceptions propagate to the caller.
+        """
+        from vllm.utils import random_uuid
+
+        formatted = self._format_prompt(request.prompt)
+        request_id = random_uuid()
+        t_start = time.perf_counter()
+        first_token_time_ms = None
+        input_tokens = 0
+        output_tokens = 0
+        output_text = ""
+
+        async for output in self.engine.generate(
+            formatted, self.sampling_params, request_id
+        ):
+            completion = output.outputs[0]
+            # Take the TTFT timestamp before any other bookkeeping so the
+            # measurement is not inflated by work done in this loop body.
+            if first_token_time_ms is None and len(completion.token_ids) > 0:
+                first_token_time_ms = (time.perf_counter() - t_start) * 1000
+            output_tokens = len(completion.token_ids)
+            output_text = completion.text
+            # Previously input_tokens was hard-coded to 0; report the prompt
+            # length the same way inference_fn_offline does. Guarded because
+            # the field may be absent or None on some streamed outputs.
+            prompt_ids = getattr(output, "prompt_token_ids", None)
+            if prompt_ids is not None:
+                input_tokens = len(prompt_ids)
+
+        total_time_ms = (time.perf_counter() - t_start) * 1000
+        return InferenceResult(
+            first_token_time_ms=first_token_time_ms,
+            total_time_ms=total_time_ms,
+            output_tokens=output_tokens,
+            input_tokens=input_tokens,
+            success=True,
+            output_text=output_text,
+        )
+
+    async def inference_fn_token_stream(self, request: InferenceRequest):
+        """Async generator yielding text deltas for serve-layer SSE.
+
+        Args:
+            request: Request whose ``prompt`` is passed through
+                ``self._format_prompt`` before generation.
+
+        Yields:
+            The new suffix of ``output.outputs[0].text`` since the previous
+            yield; empty deltas are skipped.
+        """
+        from vllm.utils import random_uuid
+
+        formatted = self._format_prompt(request.prompt)
+        request_id = random_uuid()
+        # Number of characters of the output text already sent downstream.
+        prev_length = 0
+
+        async for output in self.engine.generate(
+            formatted, self.sampling_params, request_id
+        ):
+            # The slicing below treats each chunk's text as cumulative
+            # (the full text so far), not as an increment.
+            current_text = output.outputs[0].text
+            delta = current_text[prev_length:]
+            if delta:
+                yield delta
+                prev_length = len(current_text)
+
+    # ── Memory & teardown ────────────────────────────────────────────────────
+
+    def get_peak_memory_gb(self) -> Optional[float]:
+        """Query peak HBM usage on the active MUSA device.
+
+        torchada aliases ``torch.cuda.max_memory_allocated()`` onto MUSA, so
+        the standard CUDA API returns peak MUSA memory. We fall back to
+        ``pymtml`` if importing torch or the torch query raises.
+
+        Returns:
+            Memory in GiB (bytes divided by ``1024 ** 3``), or ``None`` when
+            both the torch query and the pymtml fallback fail or pymtml
+            reports no ``used`` field.
+        """
+        try:
+            import torch
+            return torch.cuda.max_memory_allocated() / (1024 ** 3)
+        except Exception:
+            # Any failure here (import or query) moves on to pymtml.
+            pass
+        # pymtml fallback — returns currently-used memory, not strictly peak,
+        # but useful when torch.cuda is gone.
+        try:
+            import pymtml
+            pymtml.mtmlInit()
+            try:
+                # Only device index 0 is queried in this fallback.
+                dev = pymtml.mtmlDeviceGetByIndex(0)
+                info = pymtml.mtmlDeviceGetMemoryInfo(dev)
+                used = getattr(info, "used", None)
+                if used is not None:
+                    return float(used) / (1024 ** 3)
+            finally:
+                # Always pair mtmlInit() with a shutdown attempt, even when
+                # the query above raised or returned early.
+                try:
+                    pymtml.mtmlShutdown()
+                except Exception:
+                    pass
+        except Exception:
+            pass
+        return None
+
+    def release_resources(self) -> None:
+        """
+        Release vllm-musa engines and MUSA memory.
+
+        Teardown order mirrors the NVIDIA runner:
+          1. Shut down async engine (if online/interactive was used)
+          2. Delete engine objects to trigger Python GC
+          3. vLLM distributed-state cleanup (cleanup_dist_env_and_memory)
+          4. MCCL / torch.distributed process group destruction
+          5. MUSA memory cache flush via torch.cuda (aliased to MUSA by torchada)
+
+        In the code below the sync ``self.llm`` reference is dropped first,
+        before the async engine is shut down. Every step is best-effort:
+        exceptions are swallowed so that a failure in one step does not
+        prevent the later ones from running.
+        """
+        if self.llm is not None:
+            try:
+                del self.llm
+            except Exception:
+                pass
+            # Re-create the attribute so later `self.llm is not None` checks
+            # keep working after the `del`.
+            self.llm = None
+
+        if self.engine is not None:
+            try:
+                # NOTE(review): run_until_complete() needs an awaitable; if
+                # engine.shutdown() is a plain synchronous method in the
+                # installed vLLM, the resulting TypeError is swallowed here
+                # (the shutdown call itself has already run) — confirm.
+                if self._loop and not self._loop.is_closed():
+                    self._loop.run_until_complete(self.engine.shutdown())
+            except Exception:
+                pass
+            try:
+                del self.engine
+            except Exception:
+                pass
+            self.engine = None
+
+        # vLLM distributed state cleanup. cleanup_dist_env_and_memory is the
+        # same entry point as upstream vLLM — vllm-musa patches the internals
+        # but keeps the public function name.
+        try:
+            from vllm.distributed.parallel_state import cleanup_dist_env_and_memory
+            cleanup_dist_env_and_memory(shutdown_ray=False)
+        except Exception:
+            # Fallback when the combined helper is missing or fails: call the
+            # two destroy functions individually.
+            try:
+                from vllm.distributed.parallel_state import (
+                    destroy_model_parallel,
+                    destroy_distributed_environment,
+                )
+                destroy_model_parallel()
+                destroy_distributed_environment()
+            except Exception:
+                pass
+
+        # Destroy the active torch.distributed process group. On MUSA the
+        # backend is MCCL (Moore Threads Collective Communications Library)
+        # but is exposed through the standard torch.distributed.destroy_process_group
+        # entry point thanks to torchada.
+        try:
+            import torch
+            if torch.distributed.is_initialized():
+                torch.distributed.destroy_process_group()
+        except Exception:
+            pass
+
+        # Collect the engine objects released above before flushing the
+        # allocator cache.
+        gc.collect()
+
+        # Flush MUSA memory cache. torch.cuda.* is aliased to MUSA by torchada,
+        # so the standard CUDA cache-management APIs work without modification.
+        try:
+            import torch
+            torch.cuda.empty_cache()
+            # Reset the counter read by get_peak_memory_gb().
+            torch.cuda.reset_peak_memory_stats()
+        except Exception:
+            pass
+
+    # ── Argument parsing ─────────────────────────────────────────────────────
+
+    def parse_args(self):
+        """Add vllm-musa / Moore Threads-specific CLI flags.
+
+        Runs the base-class parser first, then a second help-less parser
+        that picks ``--tensor-parallel-size``, ``--expert-parallel-size``
+        and ``--enforce-eager`` out of the same command line.
+
+        Returns:
+            The namespace produced by ``super().parse_args()``, unchanged.
+
+        Side effects:
+            Sets ``self._enforce_eager``, ``self._parallelism`` and
+            ``self._chip_count``, and prints the resolved parallelism.
+        """
+        args = super().parse_args()
+        cfg = self._runner_config
+
+        import argparse
+        parser = argparse.ArgumentParser(add_help=False)
+        parser.add_argument("--tensor-parallel-size", type=int, default=None,
+                            dest="tensor_parallel_size")
+        parser.add_argument("--expert-parallel-size", type=int, default=None,
+                            dest="expert_parallel_size")
+        parser.add_argument("--enforce-eager", action="store_true", default=False,
+                            dest="enforce_eager")
+        # parse_known_args: flags this parser does not define are ignored
+        # (the list of leftovers is discarded).
+        extra, _ = parser.parse_known_args()
+
+        # The helper returns the size together with a label that is printed
+        # below as the value's source.
+        tp_size, _tp_source = self._resolve_tensor_parallel_size(
+            extra.tensor_parallel_size
+        )
+        # CLI value wins; otherwise the runner config, defaulting to 1.
+        ep_size = (extra.expert_parallel_size
+                   if extra.expert_parallel_size is not None
+                   else cfg.get("expert_parallel_size", 1))
+
+        # Either the CLI flag or the runner config can enable eager mode;
+        # the flag is store_true, so the CLI cannot switch it off.
+        self._enforce_eager = extra.enforce_eager or cfg.get("enforce_eager", False)
+
+        print(f"  tensor_parallel_size = {tp_size}  [{_tp_source}]")
+        if ep_size > 1:
+            print(f"  expert_parallel_size = {ep_size}  [cli/yaml]")
+
+        # Pipeline and data parallelism are fixed at 1 in this runner.
+        self._parallelism = {
+            "tensor_parallel_size":   tp_size,
+            "pipeline_parallel_size": 1,
+            "expert_parallel_size":   ep_size,
+            "data_parallel_size":     1,
+        }
+        # Chip count is taken to be the tensor-parallel degree.
+        self._chip_count = tp_size
+        return args
+
+    def get_extra_subprocess_args(self, args) -> list[str]:
+        """Forward vllm-musa / Moore Threads-specific flags to subprocesses.
+
+        Args:
+            args: Parsed CLI namespace; not read by this implementation.
+
+        Returns:
+            A flat argv fragment. ``--tensor-parallel-size`` is always
+            present; ``--expert-parallel-size`` is added only when the
+            stored value exceeds 1, and ``--enforce-eager`` only when
+            ``self._enforce_eager`` is truthy.
+        """
+        extra = [
+            "--tensor-parallel-size",
+            str(self._parallelism.get("tensor_parallel_size", 1)),
+        ]
+        if self._parallelism.get("expert_parallel_size", 1) > 1:
+            extra += ["--expert-parallel-size",
+                      str(self._parallelism["expert_parallel_size"])]
+        if self._enforce_eager:
+            extra += ["--enforce-eager"]
+        return extra
+
+
+# Script entry point: instantiate the runner and hand control to its main().
+if __name__ == "__main__":
+    MoorethreadsVLLMMUSARunner().main()

From 61b977ed0f3dc7cbc9681cce93520dace1d6e699 Mon Sep 17 00:00:00 2001
From: Liang Juhao <juhaoliang1997@gmail.com>
Date: Mon, 18 May 2026 16:51:21 +0800
Subject: [PATCH 2/5] Update Moore Threads runner: rename to f2f6f965, add smoke test

---
 README.md                                     |   2 +-
 ...rethreads_vllm_musa_f2f6f965.yaml.example} |   6 +-
 runners/README.md                             |   2 +-
 .../moorethreads_vllm_musa_57ff5443/README.md | 200 ------------
 .../moorethreads_vllm_musa_57ff5443/meta.json |  21 --
 .../requirements.txt                          |  58 ----
 .../moorethreads_vllm_musa_f2f6f965/README.md | 145 +++++++++
 .../moorethreads_vllm_musa_f2f6f965/meta.json |  21 ++
 .../requirements.txt                          |  22 ++
 .../runner.py                                 | 303 +++++-------------
 .../test_smoke.py                             |  77 +++++
 runners/platforms/moorethreads.py             | 192 +++++++----
 12 files changed, 487 insertions(+), 562 deletions(-)
 rename configs/runner_configs/{runner_moorethreads_vllm_musa_57ff5443.yaml.example => runner_moorethreads_vllm_musa_f2f6f965.yaml.example} (91%)
 delete mode 100644 runners/moorethreads_vllm_musa_57ff5443/README.md
 delete mode 100644 runners/moorethreads_vllm_musa_57ff5443/meta.json
 delete mode 100644 runners/moorethreads_vllm_musa_57ff5443/requirements.txt
 create mode 100644 runners/moorethreads_vllm_musa_f2f6f965/README.md
 create mode 100644 runners/moorethreads_vllm_musa_f2f6f965/meta.json
 create mode 100644 runners/moorethreads_vllm_musa_f2f6f965/requirements.txt
 rename runners/{moorethreads_vllm_musa_57ff5443 => moorethreads_vllm_musa_f2f6f965}/runner.py (52%)
 create mode 100644 runners/moorethreads_vllm_musa_f2f6f965/test_smoke.py

diff --git a/README.md b/README.md
index 92cec27..2ca3d64 100644
--- a/README.md
+++ b/README.md
@@ -93,7 +93,7 @@ Reference runners live under `runners/` (see each folder’s `meta.json`). The t
 | Huawei Ascend NPU | `ascend_vllm_ascend_d4aa9fda` | vllm-ascend | ✓ | ✓ | ✓ | ✓ | ✓ | — | — |
 | Apple Silicon | `apple_mlx_lm_9546b8b5` | mlx-lm | ⋯ | — | — | ⋯ | — | ⋯ | — |
 | Google TPU | `google_vllm_tpu_68cc9ffa` | vllm-tpu | ✓ | — | — | ✓ | — | ✓ | — |
-| Moore Threads GPU | `moorethreads_vllm_musa_57ff5443` | vllm-musa | ⋯ | ⋯ | ⋯ | ⋯ | ⋯ | ⋯ | — |
+| Moore Threads GPU | `moorethreads_vllm_musa_f2f6f965` | vllm-musa | ⋯ | ⋯ | ⋯ | ⋯ | ⋯ | ⋯ | — |
 
 _Legend: ✓ validated · ⋯ author-declared (not smoke-tested in this repo yet) · — unsupported._
 <!-- platforms-matrix:end -->
diff --git a/configs/runner_configs/runner_moorethreads_vllm_musa_57ff5443.yaml.example b/configs/runner_configs/runner_moorethreads_vllm_musa_f2f6f965.yaml.example
similarity index 91%
rename from configs/runner_configs/runner_moorethreads_vllm_musa_57ff5443.yaml.example
rename to configs/runner_configs/runner_moorethreads_vllm_musa_f2f6f965.yaml.example
index 5c8f878..c18f98b 100644
--- a/configs/runner_configs/runner_moorethreads_vllm_musa_57ff5443.yaml.example
+++ b/configs/runner_configs/runner_moorethreads_vllm_musa_f2f6f965.yaml.example
@@ -1,6 +1,6 @@
-# AccelMark runner config — moorethreads_vllm_musa_57ff5443 (vllm-musa on Moore Threads)
+# AccelMark runner config — moorethreads_vllm_musa_f2f6f965 (vllm-musa on Moore Threads)
 #
-# Copy this file to runner_moorethreads_vllm_musa_57ff5443.yaml (remove
+# Copy this file to runner_moorethreads_vllm_musa_f2f6f965.yaml (remove
 # .example suffix) and edit as needed for your hardware. The actual .yaml
 # is gitignored.
 #
@@ -45,8 +45,6 @@ suites:
     gpu_memory_utilization: 0.80
 
   suite_F:
-    # Consumer / edge GPU — enforce_eager often safer for first runs.
-    # enforce_eager: true
     max_num_seqs: 128
 
 # ── Speculative decoding (suite_A / suite_D extra scenario) ─────────────────
diff --git a/runners/README.md b/runners/README.md
index 95290aa..aaf4d81 100644
--- a/runners/README.md
+++ b/runners/README.md
@@ -252,7 +252,7 @@ nvidia_trtllm_fp8_8d2f1a4b
 amd_vllm_rocm_7b2e1d8f
 ascend_mindie_9c4a3f11
 apple_mlx_b3e21f09
-moorethreads_vllm_musa_57ff5443
+moorethreads_vllm_musa_f2f6f965
 ```
 
 ---
diff --git a/runners/moorethreads_vllm_musa_57ff5443/README.md b/runners/moorethreads_vllm_musa_57ff5443/README.md
deleted file mode 100644
index 82411a1..0000000
--- a/runners/moorethreads_vllm_musa_57ff5443/README.md
+++ /dev/null
@@ -1,200 +0,0 @@
-# moorethreads_vllm_musa_57ff5443 — Moore Threads MUSA Runner (vllm-musa)
-
-AccelMark runner for Moore Threads MUSA GPUs using
-[vllm-musa](https://github.com/MooreThreads/vllm-musa), the official vLLM
-platform plugin for MUSA hardware.
-
-> **Status:** This runner is **untested on real silicon at the time of
-> commit**. The code is written against the public `vllm-musa` plugin
-> documentation and follows the structural template of the
-> `ascend_vllm_ascend_*` runner. Plan to smoke-test on an S5000 / S4000
-> system; capability flags and dtype mappings may be adjusted in a follow-up
-> runner version (new hash, new folder) based on real-world findings.
-
-## How vllm-musa works
-
-`vllm-musa` is a vLLM **platform plugin** (auto-detected on `import vllm`)
-that makes the standard vLLM Python API run on Moore Threads MUSA GPUs. It
-relies on three components:
-
-| Component | Role |
-|---|---|
-| `torchada` | CUDA→MUSA compatibility layer for PyTorch — aliases `torch.cuda.*` to MUSA so most code paths run unmodified |
-| `pymtml` (`mthreads-ml-py`) | Moore Threads Management Library bindings, equivalent to `nvidia-ml-py` |
-| Triton patches | Runtime monkey-patches in `vllm_musa_platform.patches.*` that fix `triton.attention` and `worker` modules for MUSA's Triton compiler |
-
-The standard `vllm.LLM`, `vllm.AsyncLLMEngine`, and `vllm.SamplingParams`
-remain the entry points — this runner therefore reuses ~95% of the logic
-from the NVIDIA / Ascend vLLM runners.
-
-## Supported suites
-
-| Suite | Description | Notes |
-|-------|-------------|-------|
-| Suite A | Single-chip, Llama-3-8B | Pending smoke test on S4000 / S5000 |
-| Suite B | Multi-chip, Llama-3-70B | Requires multiple Moore Threads cards + MCCL TP |
-| Suite C | Quantization, Llama-3.1-8B | FP8 skipped (no native FP8 in current MUSA hardware); compressed-tensors W8A8/W8A16 candidate; AWQ / GPTQ pending validation |
-| Suite D | Long context ~28K input, Llama-3.1-8B | Reduce `max_num_seqs` and `gpu_memory_utilization` |
-| Suite E | Multi-chip scaling, Llama-3-8B | Validates MCCL tensor parallelism |
-| Suite F | Consumer/edge, Qwen2.5-0.5B | Recommended starting point for S4000 single-card systems |
-
-## Hardware compatibility
-
-| GPU | BF16 | TP via MCCL | FP8 | Notes |
-|-----|------|-------------|-----|-------|
-| MTT S5000 | ✅ | ✅ | ❌ | Recommended public reference target (FA3 via MATE) |
-| MTT S4000 | ✅ | ✅ | ❌ | Validated path with PyTorch SDPA-based FlashAttention |
-| MTT S3000 | ⚠️ | ⚠️ | ❌ | May work via `--enforce-eager`; not the public reference |
-| MTT S80 | ⚠️ | — | ❌ | Consumer card; treat as best-effort |
-
-## Prerequisites
-
-You must install the MUSA stack in this exact order — Python packages alone
-are not sufficient:
-
-**1. MUSA toolkit + driver**
-
-Match the toolkit version to your card firmware. Reference:
-<https://developer.mthreads.com/musa/>
-
-**2. PyTorch with MUSA support (torch + torchada)**
-
-The recommended path is the official Moore Threads container, which ships a
-pre-built `torch==2.7.1` together with `torchada` and `pymtml`. See:
-
-```bash
-docker pull sh-harbor.mthreads.com/mcctest/musa-compile:rc4.3.3-torch2.7-20251120
-```
-
-**3. Runner dependencies**
-
-Inside the MUSA container:
-
-```bash
-pip install -r runners/moorethreads_vllm_musa_57ff5443/requirements.txt
-```
-
-This installs `vllm-musa==0.1.1` which auto-pulls a validated vLLM core
-(`0.10.1.1` by default). To use vLLM `0.13.0` instead (V1-only engine):
-
-```bash
-pip install vllm==0.13.0 --no-deps --upgrade
-pip install 'depyf==0.20.0' 'llguidance>=1.3.0,<1.4.0' \
-            'lm-format-enforcer==0.11.3' 'outlines_core==0.2.11' \
-            'xgrammar==0.1.27' 'compressed-tensors==0.12.2'
-```
-
-## Required environment variables
-
-```bash
-# Device visibility (works like CUDA_VISIBLE_DEVICES)
-export MUSA_VISIBLE_DEVICES=0,1,2,3
-
-# Recommended for multi-process workers (TP > 1)
-export VLLM_WORKER_MULTIPROC_METHOD=spawn
-```
-
-## Basic usage
-
-```bash
-# Verify the plugin is loaded before running anything else
-python -c "from vllm_musa_platform import musa_platform_plugin; print('ok')"
-
-# Suite F (single-card S4000 / S5000)
-python run.py --runner moorethreads_vllm_musa_57ff5443 --suite suite_F
-
-# Suite A (single-card datacenter benchmark)
-python run.py --runner moorethreads_vllm_musa_57ff5443 --suite suite_A
-
-# Multi-card tensor parallelism (e.g. 8 x S5000 on a single host)
-VLLM_WORKER_MULTIPROC_METHOD=spawn \
-python run.py --runner moorethreads_vllm_musa_57ff5443 \
-    --suite suite_B \
-    --tensor-parallel-size 8
-
-# Local model cache
-python run.py --runner moorethreads_vllm_musa_57ff5443 \
-    --suite suite_A \
-    --model-path /data/models/Meta-Llama-3-8B-Instruct
-```
-
-## Runner config
-
-Copy the example config and adjust for your hardware:
-
-```bash
-cp configs/runner_configs/runner_moorethreads_vllm_musa_57ff5443.yaml.example \
-   configs/runner_configs/runner_moorethreads_vllm_musa_57ff5443.yaml
-```
-
-Key settings:
-
-| Field | Default | Notes |
-|-------|---------|-------|
-| `tensor_parallel_size` | 1 | Number of MUSA GPUs for tensor parallelism |
-| `enforce_eager` | false | Disable CUDA-graph / compilation; useful for pre-S4000 cards or while debugging |
-| `max_num_seqs` | 256 | Max concurrent sequences; reduce on lower-memory cards |
-| `gpu_memory_utilization` | 0.85 | Fraction of HBM reserved for KV cache; reduce if OOM |
-
-## Triton / kernel compilation errors
-
-If you encounter errors during Triton graph capture on first request,
-disable graph capture with `--enforce-eager`:
-
-```bash
-python run.py --runner moorethreads_vllm_musa_57ff5443 \
-    --suite suite_F --enforce-eager
-```
-
-Or set persistently in the runner config YAML:
-
-```yaml
-enforce_eager: true
-```
-
-## HBM OOM errors
-
-Reduce `gpu_memory_utilization` and/or `max_num_seqs`, either globally or
-per-suite (Suite D is the most memory-hungry due to long-context inputs):
-
-```yaml
-gpu_memory_utilization: 0.80
-max_num_seqs: 128
-
-suites:
-  suite_D:
-    max_num_seqs: 32
-    gpu_memory_utilization: 0.78
-```
-
-## Known gaps (pre-smoke-test)
-
-The following items are placeholders and **must be re-validated** on real
-S4000 / S5000 hardware:
-
-- **Memory peak**: relies on `torch.cuda.max_memory_allocated()` which
-  torchada aliases to MUSA. If this returns 0 or `None`, fall back to
-  `pymtml.mtmlDeviceGetMemoryInfo()`.
-- **MCCL teardown**: assumes the same `cleanup_dist_env_and_memory` entry
-  point as upstream vLLM. If MCCL leaves a hanging process group, the
-  fallback path explicitly destroys the torch.distributed group.
-- **Quantization**: `SUPPORTED_QUANTIZATION_BACKENDS` currently lists only
-  `compressed-tensors`. AWQ / GPTQ-Marlin / FP8 are intentionally excluded
-  until kernel coverage on MUSA is confirmed.
-- **Precision detection**: `_get_chip_count()` prefers `pymtml` over
-  `torch.cuda.device_count()`. On hosts where pymtml is missing this may
-  miscount; in that case the torch fallback should still work because
-  torchada provides `torch.cuda.device_count()`.
-
-## Requirements
-
-See `requirements.txt` for the pinned plugin / extras list. The heavy
-dependencies (torch + torchada + MUSA toolkit) must come from the Moore
-Threads container; do not install them from PyPI.
-
-Minimum environment:
-- Moore Threads MTT S4000 or newer (S3000 / S80 best-effort)
-- MUSA toolkit + driver matching card firmware
-- torch 2.7.1 (Moore Threads MUSA build) + torchada ≥ 0.1.9
-- Python 3.10+
-- vllm-musa 0.1.1 (vLLM core 0.10.1.1 or 0.13.0)
diff --git a/runners/moorethreads_vllm_musa_57ff5443/meta.json b/runners/moorethreads_vllm_musa_57ff5443/meta.json
deleted file mode 100644
index 655a6ef..0000000
--- a/runners/moorethreads_vllm_musa_57ff5443/meta.json
+++ /dev/null
@@ -1,21 +0,0 @@
-{
-  "id": "moorethreads_vllm_musa_57ff5443",
-  "platform": "moorethreads",
-  "name": "vllm-musa on Moore Threads MUSA GPU",
-  "framework": "vllm-musa",
-  "submitted_by": "JuhaoLiang1997",
-  "description": "AccelMark runner for Moore Threads MTT S4000 / S5000 MUSA GPUs via the vllm-musa platform plugin (vLLM 0.10.x / 0.13.x + torchada CUDA→MUSA compatibility + pymtml). API-compatible with standard vLLM; MCCL-based tensor parallelism. FP8 excluded — not supported on current MUSA hardware. Quantization limited to compressed-tensors (W8A8/W8A16) pending real-hardware validation of AWQ / GPTQ / FP8 paths.",
-  "supersedes_chain": [],
-  "notes": "Initial Moore Threads runner. Written from the public vllm-musa documentation and the structural template of ascend_vllm_ascend_d4aa9fda; capability flags, dtype mapping and teardown sequence are placeholders awaiting smoke-testing on real S4000 / S5000 silicon.",
-  "created": "2026-05-15",
-  "hardware_label": null,
-  "suite_support": {
-    "A": "pending",
-    "B": "pending",
-    "C": "pending",
-    "D": "pending",
-    "E": "pending",
-    "F": "pending",
-    "G": "unsupported"
-  }
-}
diff --git a/runners/moorethreads_vllm_musa_57ff5443/requirements.txt b/runners/moorethreads_vllm_musa_57ff5443/requirements.txt
deleted file mode 100644
index 2a44733..0000000
--- a/runners/moorethreads_vllm_musa_57ff5443/requirements.txt
+++ /dev/null
@@ -1,58 +0,0 @@
-# AccelMark -- Moore Threads MUSA vllm-musa runner dependencies
-#
-# This runner is designed to run inside the official Moore Threads MUSA
-# container (which already ships torch + torchada built for the MUSA
-# toolkit) and only installs the vLLM platform plugin + accelmark extras
-# on top of it.
-#
-# Tested image (subject to change at smoke-test time):
-#   sh-harbor.mthreads.com/mcctest/musa-compile:rc4.3.3-torch2.7-20251120
-# Reference docker command:
-#   docker run -d --net host --privileged --pid=host --shm-size 500g \
-#     -v $PWD:/ws -w /ws \
-#     --name accelmark-musa \
-#     sh-harbor.mthreads.com/mcctest/musa-compile:rc4.3.3-torch2.7-20251120 \
-#     sleep infinity
-#   docker exec -it accelmark-musa bash
-#
-# Pre-installed in the container (do NOT reinstall via pip):
-#   torch==2.7.1              (built for MUSA with torchada)
-#   torchada>=0.1.9           (CUDA→MUSA compatibility layer)
-#   mthreads-ml-py>=2.2.5     (pymtml — MTML bindings)
-#
-# vLLM core: the plugin pulls in a compatible version automatically, but for
-# reproducibility we pin to one of the validated combinations below.
-# Pick ONE of these two stacks (uncomment the matching line in the install
-# guide in README.md):
-#
-#   stack A — vLLM 0.10.1.1 (V0 + V1 engines):
-#     pip install -e .   # plugin auto-installs vllm==0.10.1.1
-#
-#   stack B — vLLM 0.13.0 (V1-only):
-#     pip install -e .                      # plugin installs vllm==0.10.1.1
-#     pip install vllm==0.13.0 --no-deps --upgrade
-#     pip install 'depyf==0.20.0' 'llguidance>=1.3.0,<1.4.0' \
-#                 'lm-format-enforcer==0.11.3' 'outlines_core==0.2.11' \
-#                 'xgrammar==0.1.27' 'compressed-tensors==0.12.2'
-
-# vLLM MUSA platform plugin (PyPI: vllm-musa, GitHub: MooreThreads/vllm-musa)
-vllm-musa==0.1.1
-
-# Transformers stack — pin to versions compatible with vLLM 0.10.x / 0.13.x
-transformers==4.46.3
-tokenizers==0.20.3
-huggingface-hub==0.26.5
-accelerate==1.2.1
-safetensors==0.4.5
-
-# AccelMark dependencies (not bundled in the image)
-numpy==1.26.4
-jsonschema==4.25.1
-psutil==7.1.0
-tqdm==4.67.1
-
-# Async support
-aiohttp==3.12.15
-
-# Config file parsing
-PyYAML==6.0.2
diff --git a/runners/moorethreads_vllm_musa_f2f6f965/README.md b/runners/moorethreads_vllm_musa_f2f6f965/README.md
new file mode 100644
index 0000000..e963d18
--- /dev/null
+++ b/runners/moorethreads_vllm_musa_f2f6f965/README.md
@@ -0,0 +1,145 @@
+# moorethreads_vllm_musa_f2f6f965 — Moore Threads MUSA Runner (vllm-musa)
+
+AccelMark runner for Moore Threads MUSA GPUs using
+[vllm-musa](https://github.com/MooreThreads/vllm-musa).
+
+## Supported suites
+
+| Suite | Description | Notes |
+|-------|-------------|-------|
+| Suite A | Single-chip, Llama-3-8B | Smoke tested on MTT S4000; accuracy not at baseline on vLLM 0.4.x |
+| Suite B | Multi-chip, Llama-3-70B | MCCL tensor parallelism; set `VLLM_WORKER_MULTIPROC_METHOD=spawn` |
+| Suite C | Quantization, Llama-3.1-8B | FP8 skipped (not supported); W8A8/W8A16 via compressed-tensors |
+| Suite D | Long context ~28K input, Llama-3.1-8B | Reduce `max_num_seqs` / `gpu_memory_utilization` in runner config |
+| Suite E | Multi-chip scaling, Llama-3-8B | MCCL tensor parallelism |
+| Suite F | Edge, Qwen2.5-0.5B | Smoke tested on MTT S4000; recommended first run |
+| Suite G | MoE multi-chip, Mixtral-8x7B | Unsupported |
+
+## Hardware compatibility
+
+| GPU | BF16 / FP16 | Multi-chip TP | FP8 | Notes |
+|-----|-------------|---------------|-----|-------|
+| MTT S4000 / S5000 | ✅ (BF16 → float16 on vLLM &lt; 0.10) | ✅ (MCCL) | ❌ | Tested with vLLM 0.4.x+musa |
+| MTT S3000 / S80 | ✅ | ✅ | ❌ | May need `--enforce-eager` on Triton errors |
+
+FP8 is excluded — not supported on this runner. FP32 inference fails with
+FlashAttention on MUSA (use FP16 or BF16). Qwen3 requires a newer vLLM + MUSA port
+(Qwen2.5 / Llama-3 work on 0.4.x).
+
+## Prerequisites
+
+Install in this order — **do not** `pip install torch` or `vllm` from PyPI on a
+bare Linux host:
+
+**1. MUSA toolkit + driver**
+
+<https://developer.mthreads.com/musa/>
+
+**2. vllm-musa (official build)**
+
+| Resource | URL |
+|----------|-----|
+| Repository | <https://github.com/MooreThreads/vllm-musa> |
+| Build guide | [README_vllm_musa.md](https://github.com/MooreThreads/vllm-musa/blob/main/README_vllm_musa.md) |
+| PyTorch MUSA | <https://github.com/MooreThreads/torch_musa> |
+
+```bash
+git clone https://github.com/MooreThreads/vllm-musa.git
+cd vllm-musa
+bash build_musa.sh
+python -c "from vllm import LLM; print('vllm ok')"
+```
+
+**3. Runner dependencies**
+
+```bash
+pip install -r runners/moorethreads_vllm_musa_f2f6f965/requirements.txt
+```
+
+Pin `transformers` to **4.43–4.46** (not 5.x) when on vLLM 0.4.x; this matches the range in `requirements.txt`.
+
+**Environment variables**
+
+```bash
+export MUSA_VISIBLE_DEVICES=0
+export VLLM_WORKER_MULTIPROC_METHOD=spawn   # when tensor_parallel_size > 1
+```
+
+## Smoke test
+
+```bash
+python runners/moorethreads_vllm_musa_f2f6f965/test_smoke.py
+python runners/moorethreads_vllm_musa_f2f6f965/test_smoke.py /path/to/model
+```
+
+## Accuracy
+
+AccelMark runs an integrated MMLU subset after each benchmark using the **same**
+vLLM instance as the perf run. The runner sets `device=musa`, dtype, and
+tokenizer correctly; low scores on vLLM **0.4.x+musa** reflect broken generation
+in that stack, not missing AccelMark wiring.
+
+| Model | Suite | Measured | Baseline |
+|-------|-------|----------|----------|
+| Qwen2.5-0.5B-Instruct | F | **~0.07** | 0.37 (FP16) / 0.38 (BF16) |
+| Llama-3-8B-Instruct | A | **~0.07** | 0.60 (BF16) |
+
+Throughput completes normally, but answers are effectively random: outputs show
+repetition and system-prompt regurgitation, and scores sit at a similar ~7% across different models.
+
+While accuracy is broken on 0.4.x, use `--skip-accuracy-gate` to finish a perf run:
+
+```bash
+python run.py --runner moorethreads_vllm_musa_f2f6f965 \
+  --suite suite_F --precision FP16 --skip-accuracy-gate
+```
+
+Likely fix: upgrade to a vllm-musa build aligned with vLLM **0.10+** (keep
+`transformers` at 4.43–4.46 on legacy 0.4.x forks), then re-run without
+`--skip-accuracy-gate`.
+
+## Usage
+
+```bash
+python run.py --runner moorethreads_vllm_musa_f2f6f965 --suite suite_F --precision FP16
+
+VLLM_WORKER_MULTIPROC_METHOD=spawn \
+python run.py --runner moorethreads_vllm_musa_f2f6f965 \
+  --suite suite_B --tensor-parallel-size 8
+```
+
+Optional runner config (copy and edit):
+
+```bash
+cp configs/runner_configs/runner_moorethreads_vllm_musa_f2f6f965.yaml.example \
+   configs/runner_configs/runner_moorethreads_vllm_musa_f2f6f965.yaml
+```
+
+| Field | Default | Notes |
+|-------|---------|-------|
+| `tensor_parallel_size` | 1 | MCCL tensor parallelism |
+| `enforce_eager` | false | Only if Triton / graph capture errors |
+| `max_num_seqs` | 256 | Lower on small HBM |
+| `gpu_memory_utilization` | 0.85 | Lower if OOM |
+
+## Troubleshooting
+
+| Symptom | Fix |
+|---------|-----|
+| `GLIBCXX_3.4.30` on import | Import `torch` before `transformers` (runner and smoke test do this) |
+| `KeyError: 'type'` in rope_scaling | Pin `transformers==4.46.3` (not 5.x) |
+| `Expected musa device, got cuda:0` | Use this runner (`device="musa"`) |
+| MMLU ~0.07 | See [Accuracy](#accuracy); `--skip-accuracy-gate` for perf-only runs |
+| OOM | Lower `gpu_memory_utilization` / `max_num_seqs` |
+| Triton / graph errors | `--enforce-eager` or `enforce_eager: true` in runner YAML |
+
+## Requirements
+
+See `requirements.txt` for AccelMark extras. vLLM, torch_musa, and the MUSA
+driver are installed per the official vllm-musa guide above (not from this file).
+
+Minimum environment:
+
+- Moore Threads GPU with MUSA driver
+- Python 3.10+
+- vllm-musa build per [MooreThreads/vllm-musa](https://github.com/MooreThreads/vllm-musa)
diff --git a/runners/moorethreads_vllm_musa_f2f6f965/meta.json b/runners/moorethreads_vllm_musa_f2f6f965/meta.json
new file mode 100644
index 0000000..9d5728d
--- /dev/null
+++ b/runners/moorethreads_vllm_musa_f2f6f965/meta.json
@@ -0,0 +1,21 @@
+{
+  "id": "moorethreads_vllm_musa_f2f6f965",
+  "platform": "moorethreads",
+  "name": "vllm-musa on Moore Threads MUSA GPU",
+  "framework": "vllm-musa",
+  "submitted_by": "JuhaoLiang1997",
+  "description": "AccelMark runner for Moore Threads MUSA GPUs using vllm-musa (https://github.com/MooreThreads/vllm-musa). Install torch/vllm per upstream README_vllm_musa.md; requirements.txt adds benchmark deps only. Sets device=musa; BF16 maps to float16 on vLLM <0.10. MCCL tensor parallelism. FP8 unsupported.",
+  "supersedes_chain": [],
+  "notes": "MMLU not at baseline on tested vLLM 0.4.x+musa stack — see runner README.",
+  "created": "2026-05-18",
+  "hardware_label": null,
+  "suite_support": {
+    "A": "pending",
+    "B": "pending",
+    "C": "pending",
+    "D": "pending",
+    "E": "pending",
+    "F": "pending",
+    "G": "unsupported"
+  }
+}
diff --git a/runners/moorethreads_vllm_musa_f2f6f965/requirements.txt b/runners/moorethreads_vllm_musa_f2f6f965/requirements.txt
new file mode 100644
index 0000000..1fe16ee
--- /dev/null
+++ b/runners/moorethreads_vllm_musa_f2f6f965/requirements.txt
@@ -0,0 +1,22 @@
+# AccelMark — moorethreads_vllm_musa_f2f6f965
+#
+# AccelMark benchmark dependencies only. Install MUSA toolkit, torch_musa, and
+# vllm-musa first — see README.md and https://github.com/MooreThreads/vllm-musa
+#
+#   pip install -r runners/moorethreads_vllm_musa_f2f6f965/requirements.txt
+#   python runners/moorethreads_vllm_musa_f2f6f965/test_smoke.py
+
+# AccelMark / loadgen
+numpy==1.26.4
+jsonschema==4.25.1
+psutil==7.1.0
+tqdm==4.67.1
+aiohttp==3.12.15
+PyYAML==6.0.2
+
+# Tokenizer / config (pin to match vLLM 0.4.x — see README)
+transformers>=4.43.0,<4.47.0
+tokenizers>=0.20.0,<0.21.0
+huggingface-hub>=0.26.0,<0.27.0
+accelerate>=1.2.0,<1.3.0
+safetensors>=0.4.5,<0.5.0
diff --git a/runners/moorethreads_vllm_musa_57ff5443/runner.py b/runners/moorethreads_vllm_musa_f2f6f965/runner.py
similarity index 52%
rename from runners/moorethreads_vllm_musa_57ff5443/runner.py
rename to runners/moorethreads_vllm_musa_f2f6f965/runner.py
index d753330..b693369 100644
--- a/runners/moorethreads_vllm_musa_57ff5443/runner.py
+++ b/runners/moorethreads_vllm_musa_f2f6f965/runner.py
@@ -1,59 +1,8 @@
 """
-AccelMark — Moore Threads MUSA GPU benchmark runner (vllm-musa).
-
-Implements BenchmarkRunner for vLLM on Moore Threads MUSA GPUs via the
-``vllm-musa`` platform plugin. All orchestration logic lives in
-``runners/benchmark_runner.py``.
-
-The plugin works by patching vLLM at import time:
-  - ``torchada`` aliases the CUDA Python API onto MUSA
-  - ``pymtml`` (mthreads-ml-py) provides device queries equivalent to
-    nvidia-ml-py
-  - A few Triton attention/worker patches are applied to make the standard
-    vLLM kernels run on MUSA's Triton compiler.
-
-As a result, the standard vLLM Python API (``LLM``, ``AsyncLLMEngine``,
-``SamplingParams``) is fully preserved. This runner is therefore structurally
-identical to the NVIDIA / AMD / Ascend vLLM runners — the differences are
-in capability flags, device-count detection, and memory teardown.
-
-Hardware:     Moore Threads MTT S4000 / S5000 (and forward-compatible
-              successors). S3000 / S80 may also work but are not the public
-              reference target.
-Runtime:      MUSA (Meta-computing Unified System Architecture)
-Framework:    vllm-musa — https://github.com/MooreThreads/vllm-musa
-              (also published on PyPI as ``vllm-musa``)
-Precision:    BF16 (preferred on S4000+), FP16 fallback. FP8 not yet
-              supported on shipping MUSA hardware.
-Quantization: compressed-tensors (W8A8 / W8A16) declared by default. AWQ /
-              GPTQ / FP8 may be added once validated on real hardware.
-Multi-chip:   Tensor parallelism via MCCL (Moore Threads Collective
-              Communications Library). vLLM's tensor_parallel_size flag works
-              unchanged because torchada aliases the NCCL API surface.
-Streaming:    Fully supported — AsyncLLMEngine API is identical to vLLM.
-
-Installation (without a real device this is "informational"; final
-versions to be confirmed at smoke-test time):
-
-    # 1. Install the MUSA toolkit + driver matching your card firmware:
-    #    https://developer.mthreads.com/musa/
-    # 2. Install Moore Threads' PyTorch build (torch + torchada) inside the
-    #    official MUSA container, then:
-    pip install -r runners/moorethreads_vllm_musa_{hash8}/requirements.txt
-
-Usage:
-
-    # S5000 single chip
-    python run.py --runner moorethreads_vllm_musa_{hash8} --suite suite_F
-
-    # Multi-chip tensor parallelism (e.g. 8 x S5000)
-    VLLM_WORKER_MULTIPROC_METHOD=spawn \
-    python run.py --runner moorethreads_vllm_musa_{hash8} \
-        --suite suite_B --tensor-parallel-size 8
-
-Environment variables you might want to set:
-    MUSA_VISIBLE_DEVICES        — equivalent to CUDA_VISIBLE_DEVICES
-    VLLM_WORKER_MULTIPROC_METHOD=spawn   — recommended for multi-process workers
+AccelMark — Moore Threads MUSA vLLM benchmark runner (vllm-musa).
+
+Implements BenchmarkRunner for vllm-musa on Moore Threads MUSA GPUs.
+See README.md in this folder for install and hardware notes.
 """
 
 import asyncio
@@ -63,74 +12,38 @@
 from pathlib import Path
 from typing import Optional
 
-# Add repo root to path
 _REPO_ROOT = Path(__file__).resolve().parent.parent.parent
 sys.path.insert(0, str(_REPO_ROOT))
 
 from runners.benchmark_runner import BenchmarkRunner, InferenceRequest
 from loadgen.types import InferenceResult
 
-
 import logging
 logging.getLogger("vllm.engine.async_llm_engine").setLevel(logging.WARNING)
 logging.getLogger("vllm.engine.llm_engine").setLevel(logging.WARNING)
 
 
 class MoorethreadsVLLMMUSARunner(BenchmarkRunner):
-    """
-    AccelMark benchmark runner using ``vllm-musa`` on Moore Threads MUSA GPUs.
-
-    ``vllm-musa`` is registered as a vLLM platform plugin and is auto-detected
-    on ``import vllm``. The plugin activates the MUSA backend when:
-      - the plugin package is installed in the environment
-      - Moore Threads devices are visible to the process
-
-    The inference methods below are byte-for-byte identical in shape to the
-    NVIDIA vLLM runner — platform-specific logic is isolated to
-    ``_get_chip_count()``, ``load_model()``, ``get_peak_memory_gb()``, and
-    ``release_resources()``.
-    """
+    """vLLM on Moore Threads MUSA via vllm-musa."""
 
     SUPPORTS_STREAMING = True
     SUPPORTS_BATCHING = True
     SUPPORTS_ONLINE = True
-    SUPPORTS_MULTI_CHIP = True  # MCCL-based tensor parallelism on multi-card hosts
-
-    # S4000 / S5000 advertise native BF16 for LLM workloads; FP16 always works
-    # as a fallback. FP32 is left in the list for completeness but is rarely
-    # used for inference. FP8 is excluded entirely — current shipping MUSA
-    # hardware does not expose native FP8 datapaths.
-    SUPPORTED_PRECISIONS = ["bf16", "fp16", "fp32"]
-
-    # Quantization backends — start conservative. compressed-tensors is the
-    # safe default on every modern vLLM build because the kernels are pure
-    # Triton + PyTorch matmuls and so are reachable through torchada.
-    # Marlin / AWQ-CUDA / native FP8 require kernel-level validation on MUSA
-    # and should be added in a follow-up runner version after real-hardware
-    # smoke tests, not silently flipped on here.
+    SUPPORTS_MULTI_CHIP = True
+
+    SUPPORTED_PRECISIONS = ["bf16", "fp16"]
     SUPPORTED_QUANTIZATION_BACKENDS = ["compressed-tensors"]
 
+    _musa_runtime_prepared = False
+
     def __init__(self):
-        self.llm = None              # vllm.LLM (offline / accuracy)
-        self.engine = None           # vllm.AsyncLLMEngine (online / interactive)
+        self.llm = None
+        self.engine = None
         self.tokenizer = None
         self.sampling_params = None
         self._loop: asyncio.AbstractEventLoop = None
 
-    # ── Metadata ─────────────────────────────────────────────────────────────
-
     def _get_chip_count(self) -> int:
-        """Return the number of available Moore Threads MUSA GPUs.
-
-        Preference order:
-          1. ``pymtml`` (the Moore Threads management library, equivalent to
-             nvidia-ml-py). Most reliable because it queries the driver
-             directly and is not affected by ``MUSA_VISIBLE_DEVICES`` if
-             called before any ``torch`` initialisation.
-          2. ``torch.cuda.device_count()`` — torchada aliases ``torch.cuda``
-             to MUSA so this returns the visible MUSA device count in the
-             current process (respecting ``MUSA_VISIBLE_DEVICES``).
-        """
         try:
             import pymtml
             pymtml.mtmlInit()
@@ -145,7 +58,6 @@ def _get_chip_count(self) -> int:
                 return int(n)
         except Exception:
             pass
-
         try:
             import torch
             n = torch.cuda.device_count()
@@ -154,17 +66,9 @@ def _get_chip_count(self) -> int:
             return 1
 
     def _get_framework_name(self) -> str:
-        # The leaderboard groups by framework string; keep this distinct from
-        # plain "vLLM" so MUSA results are not silently mixed with CUDA results.
         return "vllm-musa"
 
     def _get_framework_version(self) -> str:
-        """Report vllm-musa plugin version, with vLLM core version appended.
-
-        The plugin version is the meaningful identifier (it pins the patch
-        set), but the underlying vLLM core version is what generates kernels
-        and parses configs. Reporting both makes results reproducible.
-        """
         plugin_version = "unknown"
         try:
             from importlib.metadata import version
@@ -175,14 +79,11 @@ def _get_framework_version(self) -> str:
                 plugin_version = getattr(vllm_musa_platform, "__version__", "unknown")
             except Exception:
                 pass
-
-        core_version = "unknown"
         try:
             import vllm
             core_version = vllm.__version__
         except Exception:
-            pass
-
+            core_version = "unknown"
         if plugin_version == "unknown" and core_version == "unknown":
             return "unknown"
         if plugin_version == "unknown":
@@ -192,20 +93,44 @@ def _get_framework_version(self) -> str:
     def get_model_format(self) -> str:
         return "HuggingFace original"
 
-    # ── Model loading ────────────────────────────────────────────────────────
+    @classmethod
+    def _prepare_musa_runtime(cls) -> None:
+        if cls._musa_runtime_prepared:
+            return
+        import torch  # noqa: F401
+        cls._musa_runtime_prepared = True
 
-    def load_model(self, model_path: str, parallelism: dict) -> None:
-        """
-        Load model onto Moore Threads MUSA GPU(s) via vllm-musa.
+    @staticmethod
+    def _legacy_vllm_musa() -> bool:
+        try:
+            import vllm
+            ver = vllm.__version__.split("+")[0]
+            major, minor = (int(x) for x in ver.split(".")[:2])
+            return (major, minor) < (0, 10)
+        except Exception:
+            return True
 
-        vllm-musa uses the standard vLLM ``LLM`` / ``AsyncLLMEngine``
-        constructors. The MUSA backend activates automatically when the
-        plugin package is installed and Moore Threads devices are present —
-        no explicit device flag is required in engine kwargs.
+    @staticmethod
+    def _get_engine_arg_fields() -> set[str]:
+        try:
+            import dataclasses
+            from vllm.engine.arg_utils import EngineArgs
+            return {f.name for f in dataclasses.fields(EngineArgs)}
+        except Exception:
+            return set()
+
+    def _resolve_musa_dtype(self, dtype: str, precision: str) -> str:
+        if not self._legacy_vllm_musa():
+            return dtype
+        if dtype in ("bfloat16", "auto") or precision.upper() == "BF16":
+            if dtype != "float16":
+                print("  Note: vLLM 0.4.x+musa — using float16")
+            return "float16"
+        return dtype
+
+    def load_model(self, model_path: str, parallelism: dict) -> None:
+        self._prepare_musa_runtime()
 
-        Pipeline parallelism is not supported (matches the vLLM CUDA backend
-        behaviour). Use ``tensor_parallel_size`` for multi-chip runs.
-        """
         from transformers import AutoTokenizer
         from vllm import LLM, AsyncLLMEngine, SamplingParams
         from vllm.engine.arg_utils import AsyncEngineArgs
@@ -214,9 +139,7 @@ def load_model(self, model_path: str, parallelism: dict) -> None:
         pp_size = parallelism["pipeline_parallel_size"]
         ep_size = parallelism.get("expert_parallel_size", 1)
         assert pp_size <= 1, (
-            "Pipeline parallelism (pp_size > 1) is not supported in "
-            "MoorethreadsVLLMMUSARunner. Use --tensor-parallel-size for "
-            "multi-chip runs."
+            "Pipeline parallelism is not supported. Use --tensor-parallel-size."
         )
 
         max_tokens = parallelism["max_tokens"]
@@ -226,33 +149,22 @@ def load_model(self, model_path: str, parallelism: dict) -> None:
 
         cfg = getattr(self, "_runner_config", {})
         max_num_seqs = cfg.get("max_num_seqs", 256)
-        # vLLM's flag name is gpu_memory_utilization, but on MUSA it controls
-        # the per-card HBM fraction reserved for the KV cache. We keep the
-        # vLLM name to stay schema-compatible with other runners' configs.
         musa_memory_util = cfg.get("gpu_memory_utilization", 0.85)
         extra_kwargs = dict(cfg.get("engine_kwargs") or {})
 
-        # Filter engine_kwargs to only fields the installed vLLM version
-        # accepts. EngineArgs is a strict dataclass — unknown kwargs raise
-        # TypeError at construction. vllm-musa supports vLLM 0.10.x and 0.13.x,
-        # whose EngineArgs fields differ slightly; filtering keeps the YAML
-        # forward-compatible.
-        try:
-            import dataclasses
-            from vllm.engine.arg_utils import EngineArgs as _EngineArgs
-            _valid = {f.name for f in dataclasses.fields(_EngineArgs)}
-            _dropped = {k: v for k, v in extra_kwargs.items() if k not in _valid}
+        _valid_engine_fields = self._get_engine_arg_fields()
+        if _valid_engine_fields:
+            _dropped = {k: v for k, v in extra_kwargs.items()
+                        if k not in _valid_engine_fields}
             if _dropped:
                 print(f"  Warning: engine_kwargs keys not supported by this "
                       f"vllm-musa / vLLM version and will be ignored: "
                       f"{list(_dropped)}")
-            extra_kwargs = {k: v for k, v in extra_kwargs.items() if k in _valid}
-        except Exception:
-            pass
+            extra_kwargs = {k: v for k, v in extra_kwargs.items()
+                            if k in _valid_engine_fields}
 
         effective_precision = getattr(self, "_effective_precision", "BF16").upper()
         precision = getattr(self, "_precision", None) or effective_precision
-
         _dtype_override = getattr(self, "_precision_dtype_override", None)
         _prec_eng_kwargs = dict(getattr(self, "_precision_engine_kwargs", None) or {})
         quantization = _prec_eng_kwargs.pop("quantization", None)
@@ -263,13 +175,11 @@ def load_model(self, model_path: str, parallelism: dict) -> None:
 
         if _dtype_override:
             dtype = _dtype_override
+        dtype = self._resolve_musa_dtype(dtype, precision)
         if _prec_eng_kwargs:
             _prec_eng_kwargs.update(extra_kwargs)
             extra_kwargs = _prec_eng_kwargs
 
-        # Translate the runner's flat speculative-decoding keys into the
-        # dict-form ``speculative_config`` used by recent vLLM versions. Skip
-        # if the user already provided ``speculative_config`` directly.
         if "speculative_model" in extra_kwargs and "speculative_config" not in extra_kwargs:
             extra_kwargs["speculative_config"] = {
                 "model": extra_kwargs.pop("speculative_model"),
@@ -288,11 +198,7 @@ def load_model(self, model_path: str, parallelism: dict) -> None:
         self.tokenizer = AutoTokenizer.from_pretrained(
             model_path, trust_remote_code=False
         )
-
-        self.sampling_params = SamplingParams(
-            max_tokens=max_tokens,
-            temperature=0.0,
-        )
+        self.sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0)
 
         base_kwargs = dict(
             model=model_path,
@@ -301,6 +207,8 @@ def load_model(self, model_path: str, parallelism: dict) -> None:
             trust_remote_code=False,
             enforce_eager=enforce_eager,
         )
+        if not _valid_engine_fields or "device" in _valid_engine_fields:
+            base_kwargs["device"] = "musa"
         if ep_size > 1:
             base_kwargs["enable_expert_parallel"] = True
         if quantization:
@@ -326,25 +234,16 @@ def load_model(self, model_path: str, parallelism: dict) -> None:
             self.engine = AsyncLLMEngine.from_engine_args(engine_args)
 
     def get_effective_dtype(self) -> Optional[str]:
-        """Report the actual compute dtype vllm-musa resolved after loading."""
         try:
             if self.llm is not None:
                 return str(self.llm.llm_engine.model_config.dtype).replace("torch.", "")
-            elif self.engine is not None:
+            if self.engine is not None:
                 return str(self.engine.engine.model_config.dtype).replace("torch.", "")
         except Exception:
             pass
         return getattr(self, "_effective_dtype", None)
 
-    # ── Inference ────────────────────────────────────────────────────────────
-
-    def inference_fn_offline(
-        self, requests: list[InferenceRequest]
-    ) -> list[InferenceResult]:
-        """
-        Synchronous batch inference via vllm-musa LLM.generate().
-        total_time_ms is wall-clock elapsed time for the full batch.
-        """
+    def inference_fn_offline(self, requests: list[InferenceRequest]) -> list[InferenceResult]:
         formatted = [self._format_prompt(r.prompt) for r in requests]
         t_start = time.perf_counter()
         outputs = self.llm.generate(formatted, self.sampling_params)
@@ -352,22 +251,19 @@ def inference_fn_offline(
 
         self._last_accuracy_outputs = [o.outputs[0].text for o in outputs]
 
-        results = []
-        for output in outputs:
-            results.append(InferenceResult(
+        return [
+            InferenceResult(
                 first_token_time_ms=None,
                 total_time_ms=elapsed * 1000,
-                output_tokens=len(output.outputs[0].token_ids),
-                input_tokens=len(output.prompt_token_ids),
+                output_tokens=len(o.outputs[0].token_ids),
+                input_tokens=len(o.prompt_token_ids),
                 success=True,
-                output_text=output.outputs[0].text,
-            ))
-        return results
-
-    async def inference_fn_streaming(
-        self, request: InferenceRequest
-    ) -> InferenceResult:
-        """Async streaming for TTFT — API identical to NVIDIA vLLM runner."""
+                output_text=o.outputs[0].text,
+            )
+            for o in outputs
+        ]
+
+    async def inference_fn_streaming(self, request: InferenceRequest) -> InferenceResult:
         from vllm.utils import random_uuid
 
         formatted = self._format_prompt(request.prompt)
@@ -380,18 +276,14 @@ async def inference_fn_streaming(
         async for output in self.engine.generate(
             formatted, self.sampling_params, request_id
         ):
-            if (
-                first_token_time_ms is None
-                and len(output.outputs[0].token_ids) > 0
-            ):
+            if first_token_time_ms is None and len(output.outputs[0].token_ids) > 0:
                 first_token_time_ms = (time.perf_counter() - t_start) * 1000
             output_tokens = len(output.outputs[0].token_ids)
             output_text = output.outputs[0].text
 
-        total_time_ms = (time.perf_counter() - t_start) * 1000
         return InferenceResult(
             first_token_time_ms=first_token_time_ms,
-            total_time_ms=total_time_ms,
+            total_time_ms=(time.perf_counter() - t_start) * 1000,
             output_tokens=output_tokens,
             input_tokens=0,
             success=True,
@@ -399,7 +291,6 @@ async def inference_fn_streaming(
         )
 
     async def inference_fn_token_stream(self, request: InferenceRequest):
-        """Async generator yielding text deltas for serve-layer SSE."""
         from vllm.utils import random_uuid
 
         formatted = self._format_prompt(request.prompt)
@@ -415,22 +306,12 @@ async def inference_fn_token_stream(self, request: InferenceRequest):
                 yield delta
                 prev_length = len(current_text)
 
-    # ── Memory & teardown ────────────────────────────────────────────────────
-
     def get_peak_memory_gb(self) -> Optional[float]:
-        """Query peak HBM usage on the active MUSA device.
-
-        torchada aliases ``torch.cuda.max_memory_allocated()`` onto MUSA, so
-        the standard CUDA API returns peak MUSA memory. We fall back to
-        ``pymtml`` if torch is unavailable for some reason.
-        """
         try:
             import torch
             return torch.cuda.max_memory_allocated() / (1024 ** 3)
         except Exception:
             pass
-        # pymtml fallback — returns currently-used memory, not strictly peak,
-        # but useful when torch.cuda is gone.
         try:
             import pymtml
             pymtml.mtmlInit()
@@ -450,16 +331,6 @@ def get_peak_memory_gb(self) -> Optional[float]:
         return None
 
     def release_resources(self) -> None:
-        """
-        Release vllm-musa engines and MUSA memory.
-
-        Teardown order mirrors the NVIDIA runner:
-          1. Shut down async engine (if online/interactive was used)
-          2. Delete engine objects to trigger Python GC
-          3. vLLM distributed-state cleanup (cleanup_dist_env_and_memory)
-          4. MCCL / torch.distributed process group destruction
-          5. MUSA memory cache flush via torch.cuda (aliased to MUSA by torchada)
-        """
         if self.llm is not None:
             try:
                 del self.llm
@@ -479,9 +350,6 @@ def release_resources(self) -> None:
                 pass
             self.engine = None
 
-        # vLLM distributed state cleanup. cleanup_dist_env_and_memory is the
-        # same entry point as upstream vLLM — vllm-musa patches the internals
-        # but keeps the public function name.
         try:
             from vllm.distributed.parallel_state import cleanup_dist_env_and_memory
             cleanup_dist_env_and_memory(shutdown_ray=False)
@@ -496,10 +364,6 @@ def release_resources(self) -> None:
             except Exception:
                 pass
 
-        # Destroy the active torch.distributed process group. On MUSA the
-        # backend is MCCL (Moore Threads Collective Communications Library)
-        # but is exposed through the standard torch.distributed.destroy_process_group
-        # entry point thanks to torchada.
         try:
             import torch
             if torch.distributed.is_initialized():
@@ -509,8 +373,6 @@ def release_resources(self) -> None:
 
         gc.collect()
 
-        # Flush MUSA memory cache. torch.cuda.* is aliased to MUSA by torchada,
-        # so the standard CUDA cache-management APIs work without modification.
         try:
             import torch
             torch.cuda.empty_cache()
@@ -518,10 +380,8 @@ def release_resources(self) -> None:
         except Exception:
             pass
 
-    # ── Argument parsing ─────────────────────────────────────────────────────
-
     def parse_args(self):
-        """Add vllm-musa / Moore Threads-specific CLI flags."""
+        """Add vllm-musa-specific CLI flags. Base class pre-loads runner config."""
         args = super().parse_args()
         cfg = self._runner_config
 
@@ -541,24 +401,29 @@ def parse_args(self):
         ep_size = (extra.expert_parallel_size
                    if extra.expert_parallel_size is not None
                    else cfg.get("expert_parallel_size", 1))
-
         self._enforce_eager = extra.enforce_eager or cfg.get("enforce_eager", False)
 
         print(f"  tensor_parallel_size = {tp_size}  [{_tp_source}]")
         if ep_size > 1:
             print(f"  expert_parallel_size = {ep_size}  [cli/yaml]")
 
+        if not self.SUPPORTS_MULTI_CHIP and tp_size > 1:
+            print(f"Warning: {self.__class__.__name__} does not support multi-chip. "
+                  f"Ignoring tensor_parallel_size={tp_size}, using 1.")
+            tp_size = 1
+            ep_size = 1
+
         self._parallelism = {
-            "tensor_parallel_size":   tp_size,
+            "tensor_parallel_size": tp_size,
             "pipeline_parallel_size": 1,
-            "expert_parallel_size":   ep_size,
-            "data_parallel_size":     1,
+            "expert_parallel_size": ep_size,
+            "data_parallel_size": 1,
         }
         self._chip_count = tp_size
+        self._precision = getattr(args, "precision", None)
         return args
 
     def get_extra_subprocess_args(self, args) -> list[str]:
-        """Forward vllm-musa / Moore Threads-specific flags to subprocesses."""
         extra = [
             "--tensor-parallel-size",
             str(self._parallelism.get("tensor_parallel_size", 1)),
diff --git a/runners/moorethreads_vllm_musa_f2f6f965/test_smoke.py b/runners/moorethreads_vllm_musa_f2f6f965/test_smoke.py
new file mode 100644
index 0000000..86cbbf9
--- /dev/null
+++ b/runners/moorethreads_vllm_musa_f2f6f965/test_smoke.py
@@ -0,0 +1,77 @@
+#!/usr/bin/env python3
+"""
+Standalone vllm-musa smoke test (does not use the AccelMark runner).
+
+Usage (from repo root):
+
+    python runners/moorethreads_vllm_musa_f2f6f965/test_smoke.py
+    python runners/moorethreads_vllm_musa_f2f6f965/test_smoke.py /path/to/model
+
+    MODEL_PATH=/path/to/Qwen2.5-0.5B-Instruct \\
+    python runners/moorethreads_vllm_musa_f2f6f965/test_smoke.py
+"""
+
+from __future__ import annotations
+
+import gc
+import os
+import sys
+import time
+
+import torch  # noqa: F401 — before transformers/vllm (libstdc++ load order)
+
+from vllm import LLM, SamplingParams
+
+_DEFAULT_MODEL = os.getenv("MODEL_PATH", "Qwen/Qwen2.5-0.5B-Instruct")
+
+PROMPTS = [
+    "The capital of France is",
+    "Say hello in one short sentence.",
+]
+
+
+def main() -> int:
+    model_path = sys.argv[1] if len(sys.argv) > 1 else _DEFAULT_MODEL
+
+    sampling_params = SamplingParams(temperature=0.0, max_tokens=64)
+
+    print(f"Loading {model_path} ...")
+    t_load = time.perf_counter()
+    llm = LLM(
+        model=model_path,
+        device="musa",
+        dtype="float16",
+        tensor_parallel_size=1,
+        max_model_len=1024,
+        max_num_seqs=4,
+        gpu_memory_utilization=0.85,
+        trust_remote_code=False,
+    )
+    print(f"Model loaded in {time.perf_counter() - t_load:.1f}s\n")
+
+    t_infer = time.perf_counter()
+    outputs = llm.generate(PROMPTS, sampling_params)
+    print(f"Inference done in {time.perf_counter() - t_infer:.1f}s\n")
+
+    for prompt, output in zip(PROMPTS, outputs):
+        text = output.outputs[0].text
+        n_tokens = len(output.outputs[0].token_ids)
+        print(f"Prompt:  {prompt!r}")
+        print(f"Output:  {text!r}")
+        print(f"Tokens:  {n_tokens}\n")
+
+    del llm
+    gc.collect()
+    try:
+        if hasattr(torch, "musa"):
+            torch.musa.empty_cache()
+        else:
+            torch.cuda.empty_cache()
+    except Exception:
+        pass
+    print("Done.")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/runners/platforms/moorethreads.py b/runners/platforms/moorethreads.py
index 708db1b..9f55684 100644
--- a/runners/platforms/moorethreads.py
+++ b/runners/platforms/moorethreads.py
@@ -1,17 +1,13 @@
 """Moore Threads MUSA GPU platform plug-in.
 
-Moore Threads ships its own driver and management tooling:
-
-* ``mthreads-gmi`` — the moral equivalent of ``nvidia-smi`` / ``rocm-smi``.
-* ``pymtml`` — Python bindings analogous to NVML / pynvml.
-* ``torchada`` — a CUDA→MUSA compatibility shim that exposes the standard
-  ``torch.cuda`` API, with the real backend version available via
-  ``torch.version.musa``.
-
-This plug-in first tries the Python bindings (best machine-readable
-output) and falls back to scraping ``mthreads-gmi`` text output. Both
-paths are best-effort: when none of the tools are installed the plug-in
-silently reports zero accelerators and the collector moves on.
+Used by ``runners/collect_env.py`` to populate ``env_info.json``.
+
+Detection order (first non-empty wins):
+
+  1. ``pymtml`` (mthreads-ml-py) — same API as used in the vllm-musa runner
+  2. ``mthreads-gmi`` text output
+  3. ``torch`` device properties (``torch.cuda`` aliased to MUSA via torchada,
+     or native ``torch.musa`` when available)
 """
 from __future__ import annotations
 
@@ -23,8 +19,6 @@
 VENDOR_LABEL = "Moore Threads"
 PRIORITY = 60
 
-# S5000 / S4000 datacenter SKUs ship with native BF16 support; the older
-# consumer-class MTT S80/S70 cards are FP16-only.
 _BF16_SUPPORTED_HINTS = ("s5000", "s4000", "s3000")
 _NO_BF16_HINTS = ("s80", "s70", "s60", "s50")
 
@@ -40,50 +34,68 @@ def _supports_bf16(chip_name: str) -> bool:
     return True
 
 
+def _driver_version_from_smi() -> str | None:
+    try:
+        out = subprocess.check_output(
+            ["mthreads-gmi"], text=True, stderr=subprocess.DEVNULL
+        )
+        m = re.search(r"Driver\s+Version\s*:\s*(\S+)", out, re.IGNORECASE)
+        if m:
+            return m.group(1)
+    except Exception:
+        pass
+    return None
+
+
 def _collect_via_pymtml() -> list[dict]:
     try:
-        import pymtml as mtml  # type: ignore[import-not-found]
+        import pymtml
     except ImportError:
         return []
 
     try:
-        mtml.mtmlInit()
+        pymtml.mtmlInit()
     except Exception:
         return []
 
+    driver = _driver_version_from_smi() or "unknown"
     accelerators: list[dict] = []
     try:
-        count = mtml.mtmlDeviceGetCount()
+        count = pymtml.mtmlDeviceGetCount()
     except Exception:
         try:
-            mtml.mtmlShutdown()
+            pymtml.mtmlShutdown()
         except Exception:
             pass
         return []
 
     for idx in range(int(count)):
         try:
-            handle = mtml.mtmlDeviceGetHandleByIndex(idx)
-            name = mtml.mtmlDeviceGetName(handle)
-            mem = mtml.mtmlDeviceGetMemoryInfo(handle)
-            total_mb = getattr(mem, "total", None) or mem.get("total", 0)
-            driver = mtml.mtmlSystemGetDriverVersion()
+            dev = pymtml.mtmlDeviceGetByIndex(idx)
+            name = pymtml.mtmlDeviceGetName(dev)
+            mem = pymtml.mtmlDeviceGetMemoryInfo(dev)
+            total_bytes = getattr(mem, "total", None)
+            if total_bytes is None and isinstance(mem, dict):
+                total_bytes = mem.get("total")
         except Exception:
             continue
+        if not isinstance(name, str):
+            name = name.decode("utf-8", "ignore")
+        memory_gb = round(int(total_bytes) / (1024 ** 3), 1) if total_bytes else None
         accelerators.append(
             {
                 "index": idx,
-                "name": name if isinstance(name, str) else name.decode("utf-8", "ignore"),
+                "name": name,
                 "vendor": VENDOR_LABEL,
-                "memory_gb": round(int(total_mb) / 1024, 1) if total_mb else None,
-                "driver_version": driver if isinstance(driver, str) else driver.decode("utf-8", "ignore"),
+                "memory_gb": memory_gb,
+                "driver_version": driver,
                 "firmware_version": None,
-                "supports_bf16": _supports_bf16(str(name)),
+                "supports_bf16": _supports_bf16(name),
             }
         )
 
     try:
-        mtml.mtmlShutdown()
+        pymtml.mtmlShutdown()
     except Exception:
         pass
 
@@ -91,12 +103,7 @@ def _collect_via_pymtml() -> list[dict]:
 
 
 def _collect_via_smi() -> list[dict]:
-    """Fallback parser for ``mthreads-gmi`` text output.
-
-    The output format mirrors nvidia-smi: a header with the driver / MUSA
-    versions followed by per-device blocks listing the product name and
-    memory usage. We only need the device name and total memory.
-    """
+    """Parse ``mthreads-gmi`` text output (mthreads-gmi 1.14+ tabular format)."""
     try:
         out = subprocess.check_output(
             ["mthreads-gmi"], text=True, stderr=subprocess.DEVNULL
@@ -110,21 +117,18 @@ def _collect_via_smi() -> list[dict]:
         driver = m.group(1)
 
     accelerators: list[dict] = []
-    # Per-device rows look like:
-    #   |   0  MTT S4000                  ...     | 0000:65:00.0  Off |   ... |
-    # followed by:
-    #   |   0%   45C    P0    ... /   ... |    234MiB / 49152MiB |    ... |
+    # Example row:
+    #   0    MTT S4000      |00000000:28:00.0    |0%    4MiB(49152MiB)
     for match in re.finditer(
-        r"\|\s*(\d+)\s+(MTT\s+\S+(?:\s+\S+)?)\s*", out
+        r"^(\d+)\s+(MTT\s+\S+)\s+\|",
+        out,
+        re.MULTILINE,
     ):
         idx = int(match.group(1))
         name = match.group(2).strip()
-        # Search downstream of this match for the memory line
-        tail = out[match.end():]
-        mem_match = re.search(r"(\d+)MiB\s*/\s*(\d+)MiB", tail)
-        memory_gb = None
-        if mem_match:
-            memory_gb = round(int(mem_match.group(2)) / 1024, 1)
+        tail = out[match.end(): match.end() + 256]
+        mem_match = re.search(r"\d+MiB\((\d+)MiB\)", tail)
+        memory_gb = round(int(mem_match.group(1)) / 1024, 1) if mem_match else None
         accelerators.append(
             {
                 "index": idx,
@@ -139,23 +143,69 @@ def _collect_via_smi() -> list[dict]:
     return accelerators
 
 
+def _collect_via_torch() -> list[dict]:
+    """Fallback when management libraries are missing: enumerate devices via torch.musa, else torch.cuda."""
+    try:
+        import torch
+    except ImportError:
+        return []
+
+    driver = _driver_version_from_smi() or "unknown"  # "unknown" when the helper returns a falsy value
+    accelerators: list[dict] = []
+
+    if hasattr(torch, "musa"):
+        try:
+            count = torch.musa.device_count()
+            get_props = torch.musa.get_device_properties
+        except Exception:
+            count = 0  # zero iterations below, so the function returns []
+            get_props = None
+    else:  # NOTE(review): presumably the torchada CUDA->MUSA alias; a plain CUDA torch also lands here — confirm
+        try:
+            count = torch.cuda.device_count()
+            get_props = torch.cuda.get_device_properties
+        except Exception:
+            return []
+
+    for idx in range(int(count)):
+        try:
+            props = get_props(idx)
+            name = getattr(props, "name", None) or f"MTT GPU {idx}"  # placeholder if no usable name
+            total = getattr(props, "total_memory", None)
+            memory_gb = round(total / (1024 ** 3), 1) if total else None  # assumes total_memory is in bytes
+        except Exception:
+            continue  # skip a device whose properties cannot be read
+        accelerators.append(
+            {
+                "index": idx,
+                "name": name if isinstance(name, str) else str(name),
+                "vendor": VENDOR_LABEL,
+                "memory_gb": memory_gb,
+                "driver_version": driver,
+                "firmware_version": None,
+                "supports_bf16": _supports_bf16(str(name)),
+            }
+        )
+    return accelerators
+
+
 def collect() -> list[dict]:
-    accelerators = _collect_via_pymtml()
-    if accelerators:
-        return accelerators
-    return _collect_via_smi()
+    for fn in (_collect_via_pymtml, _collect_via_smi, _collect_via_torch):
+        accelerators = fn()
+        if accelerators:
+            return accelerators
+    return []
 
 
 def detect_runtime_version() -> str | None:
-    """Prefer torch.version.musa (most reliable when torchada is installed),
-    fall back to scraping ``mthreads-gmi`` header.
-    """
     try:
         import torch
 
         ver = getattr(torch.version, "musa", None)
         if ver:
             return f"MUSA {ver}"
+        if getattr(torch.version, "cuda", None):
+            return f"MUSA (torch.cuda shim) {torch.version.cuda}"
     except ImportError:
         pass
 
@@ -174,17 +224,43 @@ def detect_runtime_version() -> str | None:
     return None
 
 
+def detect_pcie_gen() -> str | None:  # yields e.g. "PCIe 16x/16x", or None when nothing matches
+    try:
+        out = subprocess.check_output(
+            ["mthreads-gmi"], text=True, stderr=subprocess.DEVNULL
+        )
+        m = re.search(r"\|\s*(\d+)x\((\d+)x\)\s*\|", out)  # first "|<N>x(<M>x)|" cell in the output
+        if m:
+            return f"PCIe {m.group(1)}x/{m.group(2)}x"  # NOTE(review): looks like link width, not a PCIe generation — confirm
+    except Exception:
+        pass  # best-effort: a missing tool or failed command falls through to None
+    return None
+
+
+def detect_intra_node_interconnect() -> str | None:
+    """Return "MCCL/PCIe" if collect() finds >1 device (such hosts typically use MCCL over PCIe), else None."""
+    accels = collect()  # re-runs the detection chain; only the device count is used
+    if len(accels) > 1:
+        return "MCCL/PCIe"  # fixed label; no topology is probed here
+    return None
+
+
 def diagnostics(env: dict, accelerators: list[dict]) -> list[str]:
     notes: list[str] = []
-    if accelerators and (env.get("pytorch_version") or "") == "unknown":
+    if not accelerators:
+        notes.append(
+            "No Moore Threads MUSA GPUs detected (tried pymtml, mthreads-gmi, "
+            "and torch). Install the MUSA driver/toolkit per "
+            "https://github.com/MooreThreads/vllm-musa ."
+        )
+        return notes
+    if (env.get("pytorch_version") or "") == "unknown":
         notes.append(
-            "PyTorch (with the torchada MUSA shim) is not installed — "
-            "pytorch_version is unknown."
+            "PyTorch with MUSA support is not installed — pytorch_version is unknown."
         )
-    if accelerators and (env.get("runtime_version") or "") == "unknown":
+    if (env.get("runtime_version") or "") == "unknown":
         notes.append(
             "Could not detect MUSA runtime (tried torch.version.musa and "
-            "mthreads-gmi). runtime_version is unknown — install torchada "
-            "or the Moore Threads MUSA toolkit."
+            "mthreads-gmi). runtime_version is unknown."
         )
     return notes

From 03e30bddd5899b596eadec2283da6f56f632ce8b Mon Sep 17 00:00:00 2001
From: Liang Juhao <juhaoliang1997@gmail.com>
Date: Mon, 18 May 2026 18:05:54 +0800
Subject: [PATCH 3/5] add moore schema

---
 schema/env.schema.json    | 2 +-
 schema/result.schema.json | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/schema/env.schema.json b/schema/env.schema.json
index 60fc5e8..e80cd94 100644
--- a/schema/env.schema.json
+++ b/schema/env.schema.json
@@ -16,7 +16,7 @@
         "properties": {
           "index": { "type": "integer" },
           "name": { "type": "string" },
-          "vendor": { "type": "string", "description": "Chip vendor, e.g. 'NVIDIA', 'AMD', 'Huawei', 'Apple'" },
+          "vendor": { "type": "string", "description": "Chip vendor, e.g. 'NVIDIA', 'AMD', 'Huawei', 'Apple', 'Moore Threads'" },
           "memory_gb": { "type": ["number","null"], "minimum": 0 },
           "driver_version": { "type": "string" },
           "firmware_version": { "type": ["string","null"] },
diff --git a/schema/result.schema.json b/schema/result.schema.json
index 99a0517..fb81a8a 100644
--- a/schema/result.schema.json
+++ b/schema/result.schema.json
@@ -36,7 +36,7 @@
         "vendor": {
           "type": "string",
           "enum": ["NVIDIA","AMD","Intel","Google","Huawei","Cambricon","Biren",
-                   "Enflame","MetaX","Iluvatar","Apple","Qualcomm","Other"]
+                   "Enflame","MetaX","Moore Threads","Iluvatar","Apple","Qualcomm","Other"]
         },
         "count": { "type": "integer", "minimum": 1 },
         "memory_gb": { "type": "number", "minimum": 0 },

From 7e8c3df569fab441331e0aae0788f8b0d460d2e6 Mon Sep 17 00:00:00 2001
From: Liang Juhao <juhaoliang1997@gmail.com>
Date: Mon, 18 May 2026 18:09:21 +0800
Subject: [PATCH 4/5] upload moore results

---
 .../accuracy/accuracy.json                    |   8 +
 .../env_info.json                             |  48 ++++
 .../offline/result.json                       | 164 +++++++++++++
 .../online/result.json                        | 163 +++++++++++++
 .../result.json                               | 215 ++++++++++++++++++
 .../accuracy/accuracy.json                    |   8 +
 .../env_info.json                             |  48 ++++
 .../interactive/result.json                   | 131 +++++++++++
 .../offline/result.json                       | 164 +++++++++++++
 .../online/result.json                        | 151 ++++++++++++
 .../result.json                               | 215 ++++++++++++++++++
 11 files changed, 1315 insertions(+)
 create mode 100644 results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/accuracy/accuracy.json
 create mode 100644 results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/env_info.json
 create mode 100644 results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/offline/result.json
 create mode 100644 results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/online/result.json
 create mode 100644 results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/result.json
 create mode 100644 results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/accuracy/accuracy.json
 create mode 100644 results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/env_info.json
 create mode 100644 results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/interactive/result.json
 create mode 100644 results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/offline/result.json
 create mode 100644 results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/online/result.json
 create mode 100644 results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/result.json

diff --git a/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/accuracy/accuracy.json b/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/accuracy/accuracy.json
new file mode 100644
index 0000000..7242234
--- /dev/null
+++ b/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/accuracy/accuracy.json
@@ -0,0 +1,8 @@
+{
+  "subset_score": 0.07,
+  "baseline_delta": -0.53,
+  "valid": false,
+  "framework": "vllm-musa",
+  "precision": "BF16",
+  "notes": "Integrated accuracy check \u2014 used same vllm-musa instance as benchmark."
+}
\ No newline at end of file
diff --git a/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/env_info.json b/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/env_info.json
new file mode 100644
index 0000000..4244ef7
--- /dev/null
+++ b/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/env_info.json
@@ -0,0 +1,48 @@
+{
+  "collected_at": "2026-05-18T09:21:31.092840+00:00",
+  "accelerators": [
+    {
+      "index": 0,
+      "name": "MTT S4000",
+      "vendor": "Moore Threads",
+      "memory_gb": 48.0,
+      "driver_version": "2.7.0",
+      "firmware_version": null,
+      "supports_bf16": true
+    }
+  ],
+  "accelerator_platform": "moorethreads",
+  "accelerator_topology": null,
+  "intra_node_interconnect": null,
+  "cpu": {
+    "model": "Intel(R) Xeon(R) Gold 6430",
+    "physical_cores": 64,
+    "logical_cores": 128,
+    "numa_nodes": 2
+  },
+  "system_memory_gb": 1007.5,
+  "pcie_generation": "PCIe 16x/16x",
+  "cpu_accelerator_bandwidth_gbs": null,
+  "network_interfaces": [
+    {
+      "name": "mlx5_0",
+      "type": "InfiniBand/RoCE",
+      "bandwidth_gbps": null
+    },
+    {
+      "name": "mlx5_1",
+      "type": "InfiniBand/RoCE",
+      "bandwidth_gbps": null
+    },
+    {
+      "name": "mlx5_bond_0",
+      "type": "InfiniBand/RoCE",
+      "bandwidth_gbps": null
+    }
+  ],
+  "os": "Ubuntu Jammy Jellyfish (development branch)",
+  "python_version": "3.10.8",
+  "kernel_version": "5.15.0-105-generic",
+  "runtime_version": "Moore Threads Driver 2.7.0",
+  "pytorch_version": "2.2.0"
+}
\ No newline at end of file
diff --git a/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/offline/result.json b/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/offline/result.json
new file mode 100644
index 0000000..a050fe4
--- /dev/null
+++ b/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/offline/result.json
@@ -0,0 +1,164 @@
+{
+  "schema_version": "1.0",
+  "suite_id": "suite_A",
+  "implementation_id": "moorethreads_vllm_musa_f2f6f965",
+  "chip": {
+    "name": "MTT S4000",
+    "vendor": "Moore Threads",
+    "count": 1,
+    "memory_gb": 48.0,
+    "interconnect_intra_node": null,
+    "interconnect_inter_node": null
+  },
+  "environment": {
+    "collected_at": "2026-05-18T09:21:31.092840+00:00",
+    "accelerators": [
+      {
+        "index": 0,
+        "name": "MTT S4000",
+        "vendor": "Moore Threads",
+        "memory_gb": 48.0,
+        "driver_version": "2.7.0",
+        "firmware_version": null,
+        "supports_bf16": true
+      }
+    ],
+    "accelerator_platform": "moorethreads",
+    "accelerator_topology": null,
+    "intra_node_interconnect": null,
+    "cpu": {
+      "model": "Intel(R) Xeon(R) Gold 6430",
+      "physical_cores": 64,
+      "logical_cores": 128,
+      "numa_nodes": 2
+    },
+    "system_memory_gb": 1007.5,
+    "pcie_generation": "PCIe 16x/16x",
+    "cpu_accelerator_bandwidth_gbs": null,
+    "network_interfaces": [
+      {
+        "name": "mlx5_0",
+        "type": "InfiniBand/RoCE",
+        "bandwidth_gbps": null
+      },
+      {
+        "name": "mlx5_1",
+        "type": "InfiniBand/RoCE",
+        "bandwidth_gbps": null
+      },
+      {
+        "name": "mlx5_bond_0",
+        "type": "InfiniBand/RoCE",
+        "bandwidth_gbps": null
+      }
+    ],
+    "os": "Ubuntu Jammy Jellyfish (development branch)",
+    "python_version": "3.10.8",
+    "kernel_version": "5.15.0-105-generic",
+    "runtime_version": "Moore Threads Driver 2.7.0",
+    "pytorch_version": "2.2.0"
+  },
+  "software": {
+    "framework": "vllm-musa",
+    "framework_version": "0.4.2",
+    "driver_version": "2.7.0",
+    "runtime_version": "Moore Threads Driver 2.7.0",
+    "os": "Ubuntu Jammy Jellyfish (development branch)",
+    "python_version": "3.10.8"
+  },
+  "model": {
+    "model_id": "meta-llama/Meta-Llama-3-8B-Instruct",
+    "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2",
+    "model_name": null,
+    "model_note": null,
+    "model_source": "local",
+    "architecture": "dense",
+    "parameter_count_b": 8.0,
+    "precision": "BF16",
+    "effective_dtype": "float16",
+    "quantization_method": null,
+    "model_format": "HuggingFace original"
+  },
+  "task": {
+    "scenario": "offline",
+    "num_runs": 3,
+    "warmup_runs": 1,
+    "parallelism": {
+      "tensor_parallel_size": 1,
+      "pipeline_parallel_size": 1,
+      "expert_parallel_size": 1,
+      "data_parallel_size": 1
+    },
+    "extra_config": null,
+    "runtime_metrics": null
+  },
+  "metrics": {
+    "offline": {
+      "results_by_concurrency": [
+        {
+          "client_concurrency": 8,
+          "throughput_tokens_per_sec": 332.62,
+          "throughput_tokens_per_sec_per_chip": 332.62,
+          "throughput_tokens_per_sec_total": 922.83,
+          "elapsed_seconds_median": 43.4,
+          "peak_memory_gb": null,
+          "power_watts_avg": null,
+          "power_watts_peak": null,
+          "oom": false,
+          "_throughput_note": "output_only",
+          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+        },
+        {
+          "client_concurrency": 32,
+          "throughput_tokens_per_sec": 331.64,
+          "throughput_tokens_per_sec_per_chip": 331.64,
+          "throughput_tokens_per_sec_total": 920.1,
+          "elapsed_seconds_median": 43.6,
+          "peak_memory_gb": null,
+          "power_watts_avg": null,
+          "power_watts_peak": null,
+          "oom": false,
+          "_throughput_note": "output_only",
+          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+        },
+        {
+          "client_concurrency": 128,
+          "throughput_tokens_per_sec": 331.76,
+          "throughput_tokens_per_sec_per_chip": 331.76,
+          "throughput_tokens_per_sec_total": 920.46,
+          "elapsed_seconds_median": 43.6,
+          "peak_memory_gb": null,
+          "power_watts_avg": null,
+          "power_watts_peak": null,
+          "oom": false,
+          "_throughput_note": "output_only",
+          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+        }
+      ]
+    }
+  },
+  "accuracy": {
+    "subset_score": null,
+    "baseline_delta": null,
+    "valid": false,
+    "notes": "Run --scenario accuracy to check model accuracy."
+  },
+  "meta": {
+    "submitted_by": "JuhaoLiang1997",
+    "submission_type": "individual",
+    "date": "2026-05-18",
+    "time": "17:34:52",
+    "run_id": "cabb7bd0",
+    "run_name": "mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0",
+    "flagged": null,
+    "reproduce_script": "runners/moorethreads_vllm_musa_f2f6f965/runner.py",
+    "env_info_file": "../env_info.json",
+    "log_file": "run.log",
+    "samples_file": "samples.jsonl",
+    "notes": null,
+    "benchmark_start_time": "2026-05-18T09:26:10.676960+00:00",
+    "benchmark_end_time": "2026-05-18T09:34:52.667112+00:00",
+    "benchmark_elapsed_minutes": 8.7,
+    "model_load_seconds": 116.8
+  }
+}
\ No newline at end of file
diff --git a/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/online/result.json b/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/online/result.json
new file mode 100644
index 0000000..064d6b8
--- /dev/null
+++ b/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/online/result.json
@@ -0,0 +1,163 @@
+{
+  "schema_version": "1.0",
+  "suite_id": "suite_A",
+  "implementation_id": "moorethreads_vllm_musa_f2f6f965",
+  "chip": {
+    "name": "MTT S4000",
+    "vendor": "Moore Threads",
+    "count": 1,
+    "memory_gb": 48.0,
+    "interconnect_intra_node": null,
+    "interconnect_inter_node": null
+  },
+  "environment": {
+    "collected_at": "2026-05-18T09:21:31.092840+00:00",
+    "accelerators": [
+      {
+        "index": 0,
+        "name": "MTT S4000",
+        "vendor": "Moore Threads",
+        "memory_gb": 48.0,
+        "driver_version": "2.7.0",
+        "firmware_version": null,
+        "supports_bf16": true
+      }
+    ],
+    "accelerator_platform": "moorethreads",
+    "accelerator_topology": null,
+    "intra_node_interconnect": null,
+    "cpu": {
+      "model": "Intel(R) Xeon(R) Gold 6430",
+      "physical_cores": 64,
+      "logical_cores": 128,
+      "numa_nodes": 2
+    },
+    "system_memory_gb": 1007.5,
+    "pcie_generation": "PCIe 16x/16x",
+    "cpu_accelerator_bandwidth_gbs": null,
+    "network_interfaces": [
+      {
+        "name": "mlx5_0",
+        "type": "InfiniBand/RoCE",
+        "bandwidth_gbps": null
+      },
+      {
+        "name": "mlx5_1",
+        "type": "InfiniBand/RoCE",
+        "bandwidth_gbps": null
+      },
+      {
+        "name": "mlx5_bond_0",
+        "type": "InfiniBand/RoCE",
+        "bandwidth_gbps": null
+      }
+    ],
+    "os": "Ubuntu Jammy Jellyfish (development branch)",
+    "python_version": "3.10.8",
+    "kernel_version": "5.15.0-105-generic",
+    "runtime_version": "Moore Threads Driver 2.7.0",
+    "pytorch_version": "2.2.0"
+  },
+  "software": {
+    "framework": "vllm-musa",
+    "framework_version": "0.4.2",
+    "driver_version": "2.7.0",
+    "runtime_version": "Moore Threads Driver 2.7.0",
+    "os": "Ubuntu Jammy Jellyfish (development branch)",
+    "python_version": "3.10.8"
+  },
+  "model": {
+    "model_id": "meta-llama/Meta-Llama-3-8B-Instruct",
+    "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2",
+    "model_name": null,
+    "model_note": null,
+    "model_source": "local",
+    "architecture": "dense",
+    "parameter_count_b": 8.0,
+    "precision": "BF16",
+    "effective_dtype": "float16",
+    "quantization_method": null,
+    "model_format": "HuggingFace original"
+  },
+  "task": {
+    "scenario": "online",
+    "num_runs": 3,
+    "warmup_runs": 1,
+    "parallelism": {
+      "tensor_parallel_size": 1,
+      "pipeline_parallel_size": 1,
+      "expert_parallel_size": 1,
+      "data_parallel_size": 1
+    },
+    "extra_config": null,
+    "runtime_metrics": null
+  },
+  "metrics": {
+    "online": {
+      "sla_ttft_ms": 500,
+      "max_valid_qps": 5,
+      "results_by_qps": [
+        {
+          "target_qps": 5,
+          "achieved_qps": 5.0,
+          "ttft_ms_p50": 194.45,
+          "ttft_ms_p90": 315.05,
+          "ttft_ms_p99": 424.55,
+          "tpot_ms_p50": 201.93,
+          "tpot_ms_p90": 253.8,
+          "tpot_ms_p99": 471.28,
+          "elapsed_seconds_median": 137.6,
+          "sla_met": true
+        },
+        {
+          "target_qps": 25,
+          "achieved_qps": 25.0,
+          "ttft_ms_p50": 4796.14,
+          "ttft_ms_p90": 8459.18,
+          "ttft_ms_p99": 9348.86,
+          "tpot_ms_p50": 355.01,
+          "tpot_ms_p90": 6430.04,
+          "tpot_ms_p99": 15579.83,
+          "elapsed_seconds_median": 93.0,
+          "sla_met": false
+        },
+        {
+          "target_qps": 100,
+          "achieved_qps": 100.0,
+          "ttft_ms_p50": 10354.27,
+          "ttft_ms_p90": 17651.16,
+          "ttft_ms_p99": 19078.89,
+          "tpot_ms_p50": 849.82,
+          "tpot_ms_p90": 8677.79,
+          "tpot_ms_p99": 14281.03,
+          "elapsed_seconds_median": 90.0,
+          "sla_met": false
+        }
+      ]
+    }
+  },
+  "accuracy": {
+    "subset_score": null,
+    "baseline_delta": null,
+    "valid": false,
+    "notes": "Run --scenario accuracy to check model accuracy."
+  },
+  "meta": {
+    "submitted_by": "JuhaoLiang1997",
+    "submission_type": "individual",
+    "date": "2026-05-18",
+    "time": "17:53:38",
+    "run_id": "cabb7bd0",
+    "run_name": "mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0",
+    "flagged": null,
+    "reproduce_script": "runners/moorethreads_vllm_musa_f2f6f965/runner.py",
+    "env_info_file": "../env_info.json",
+    "log_file": "run.log",
+    "samples_file": "samples.jsonl",
+    "notes": null,
+    "benchmark_start_time": "2026-05-18T09:37:13.745117+00:00",
+    "benchmark_end_time": "2026-05-18T09:53:38.865501+00:00",
+    "benchmark_elapsed_minutes": 16.4,
+    "model_load_seconds": 122.7
+  }
+}
\ No newline at end of file
diff --git a/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/result.json b/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/result.json
new file mode 100644
index 0000000..e4b1093
--- /dev/null
+++ b/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/result.json
@@ -0,0 +1,215 @@
+{
+  "schema_version": "1.0",
+  "suite_id": "suite_A",
+  "implementation_id": "moorethreads_vllm_musa_f2f6f965",
+  "chip": {
+    "name": "MTT S4000",
+    "vendor": "Moore Threads",
+    "count": 1,
+    "memory_gb": 48.0,
+    "interconnect_intra_node": null,
+    "interconnect_inter_node": null
+  },
+  "environment": {
+    "collected_at": "2026-05-18T09:21:31.092840+00:00",
+    "accelerators": [
+      {
+        "index": 0,
+        "name": "MTT S4000",
+        "vendor": "Moore Threads",
+        "memory_gb": 48.0,
+        "driver_version": "2.7.0",
+        "firmware_version": null,
+        "supports_bf16": true
+      }
+    ],
+    "accelerator_platform": "moorethreads",
+    "accelerator_topology": null,
+    "intra_node_interconnect": null,
+    "cpu": {
+      "model": "Intel(R) Xeon(R) Gold 6430",
+      "physical_cores": 64,
+      "logical_cores": 128,
+      "numa_nodes": 2
+    },
+    "system_memory_gb": 1007.5,
+    "pcie_generation": "PCIe 16x/16x",
+    "cpu_accelerator_bandwidth_gbs": null,
+    "network_interfaces": [
+      {
+        "name": "mlx5_0",
+        "type": "InfiniBand/RoCE",
+        "bandwidth_gbps": null
+      },
+      {
+        "name": "mlx5_1",
+        "type": "InfiniBand/RoCE",
+        "bandwidth_gbps": null
+      },
+      {
+        "name": "mlx5_bond_0",
+        "type": "InfiniBand/RoCE",
+        "bandwidth_gbps": null
+      }
+    ],
+    "os": "Ubuntu Jammy Jellyfish (development branch)",
+    "python_version": "3.10.8",
+    "kernel_version": "5.15.0-105-generic",
+    "runtime_version": "Moore Threads Driver 2.7.0",
+    "pytorch_version": "2.2.0"
+  },
+  "software": {
+    "framework": "vllm-musa",
+    "framework_version": "0.4.2",
+    "driver_version": "2.7.0",
+    "runtime_version": "Moore Threads Driver 2.7.0",
+    "os": "Ubuntu Jammy Jellyfish (development branch)",
+    "python_version": "3.10.8"
+  },
+  "model": {
+    "model_id": "meta-llama/Meta-Llama-3-8B-Instruct",
+    "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2",
+    "model_name": null,
+    "model_note": null,
+    "model_source": "local",
+    "architecture": "dense",
+    "parameter_count_b": 8.0,
+    "precision": "BF16",
+    "effective_dtype": "float16",
+    "quantization_method": null,
+    "model_format": "HuggingFace original"
+  },
+  "task": {
+    "scenarios_run": [
+      "offline",
+      "online"
+    ],
+    "parallelism": {
+      "tensor_parallel_size": 1,
+      "pipeline_parallel_size": 1,
+      "expert_parallel_size": 1,
+      "data_parallel_size": 1
+    },
+    "num_runs": 3,
+    "extra_config": null
+  },
+  "metrics": {
+    "derived": {},
+    "offline": {
+      "results_by_concurrency": [
+        {
+          "client_concurrency": 8,
+          "throughput_tokens_per_sec": 332.62,
+          "throughput_tokens_per_sec_per_chip": 332.62,
+          "throughput_tokens_per_sec_total": 922.83,
+          "elapsed_seconds_median": 43.4,
+          "peak_memory_gb": null,
+          "power_watts_avg": null,
+          "power_watts_peak": null,
+          "oom": false,
+          "_throughput_note": "output_only",
+          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+        },
+        {
+          "client_concurrency": 32,
+          "throughput_tokens_per_sec": 331.64,
+          "throughput_tokens_per_sec_per_chip": 331.64,
+          "throughput_tokens_per_sec_total": 920.1,
+          "elapsed_seconds_median": 43.6,
+          "peak_memory_gb": null,
+          "power_watts_avg": null,
+          "power_watts_peak": null,
+          "oom": false,
+          "_throughput_note": "output_only",
+          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+        },
+        {
+          "client_concurrency": 128,
+          "throughput_tokens_per_sec": 331.76,
+          "throughput_tokens_per_sec_per_chip": 331.76,
+          "throughput_tokens_per_sec_total": 920.46,
+          "elapsed_seconds_median": 43.6,
+          "peak_memory_gb": null,
+          "power_watts_avg": null,
+          "power_watts_peak": null,
+          "oom": false,
+          "_throughput_note": "output_only",
+          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+        }
+      ]
+    },
+    "online": {
+      "sla_ttft_ms": 500,
+      "max_valid_qps": 5,
+      "results_by_qps": [
+        {
+          "target_qps": 5,
+          "achieved_qps": 5.0,
+          "ttft_ms_p50": 194.45,
+          "ttft_ms_p90": 315.05,
+          "ttft_ms_p99": 424.55,
+          "tpot_ms_p50": 201.93,
+          "tpot_ms_p90": 253.8,
+          "tpot_ms_p99": 471.28,
+          "elapsed_seconds_median": 137.6,
+          "sla_met": true
+        },
+        {
+          "target_qps": 25,
+          "achieved_qps": 25.0,
+          "ttft_ms_p50": 4796.14,
+          "ttft_ms_p90": 8459.18,
+          "ttft_ms_p99": 9348.86,
+          "tpot_ms_p50": 355.01,
+          "tpot_ms_p90": 6430.04,
+          "tpot_ms_p99": 15579.83,
+          "elapsed_seconds_median": 93.0,
+          "sla_met": false
+        },
+        {
+          "target_qps": 100,
+          "achieved_qps": 100.0,
+          "ttft_ms_p50": 10354.27,
+          "ttft_ms_p90": 17651.16,
+          "ttft_ms_p99": 19078.89,
+          "tpot_ms_p50": 849.82,
+          "tpot_ms_p90": 8677.79,
+          "tpot_ms_p99": 14281.03,
+          "elapsed_seconds_median": 90.0,
+          "sla_met": false
+        }
+      ]
+    }
+  },
+  "accuracy": {
+    "subset_score": 0.07,
+    "baseline_delta": -0.53,
+    "valid": false,
+    "framework": "vllm-musa",
+    "precision": "BF16",
+    "notes": "Integrated accuracy check \u2014 used same vllm-musa instance as benchmark."
+  },
+  "meta": {
+    "submitted_by": "JuhaoLiang1997",
+    "submission_type": "individual",
+    "date": "2026-05-18",
+    "time": "17:34:52",
+    "run_id": "cabb7bd0",
+    "run_name": "mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0",
+    "flagged": null,
+    "reproduce_script": "runners/moorethreads_vllm_musa_f2f6f965/runner.py",
+    "env_info_file": "../env_info.json",
+    "log_file": "run.log",
+    "samples_file": "samples.jsonl",
+    "notes": "Partial run: ['offline', 'online'] succeeded, ['accuracy'] failed.",
+    "benchmark_start_time": "2026-05-18T09:26:10.676960+00:00",
+    "benchmark_end_time": "2026-05-18T09:34:52.667112+00:00",
+    "benchmark_elapsed_minutes": 25.1,
+    "model_load_seconds": 116.8,
+    "benchmark_elapsed_minutes_note": "Total across ['offline', 'online'] scenarios.",
+    "scenario_dirs": {
+      "offline": "results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/offline",
+      "online": "results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/online"
+    }
+  }
+}
\ No newline at end of file
diff --git a/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/accuracy/accuracy.json b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/accuracy/accuracy.json
new file mode 100644
index 0000000..63c6e92
--- /dev/null
+++ b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/accuracy/accuracy.json
@@ -0,0 +1,8 @@
+{
+  "subset_score": 0.07,
+  "baseline_delta": -0.31,
+  "valid": false,
+  "framework": "vllm-musa",
+  "precision": "BF16",
+  "notes": "Integrated accuracy check \u2014 used same vllm-musa instance as benchmark."
+}
\ No newline at end of file
diff --git a/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/env_info.json b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/env_info.json
new file mode 100644
index 0000000..31f501b
--- /dev/null
+++ b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/env_info.json
@@ -0,0 +1,48 @@
+{
+  "collected_at": "2026-05-18T08:40:55.208034+00:00",
+  "accelerators": [
+    {
+      "index": 0,
+      "name": "MTT S4000",
+      "vendor": "Moore Threads",
+      "memory_gb": 48.0,
+      "driver_version": "2.7.0",
+      "firmware_version": null,
+      "supports_bf16": true
+    }
+  ],
+  "accelerator_platform": "moorethreads",
+  "accelerator_topology": null,
+  "intra_node_interconnect": null,
+  "cpu": {
+    "model": "Intel(R) Xeon(R) Gold 6430",
+    "physical_cores": 64,
+    "logical_cores": 128,
+    "numa_nodes": 2
+  },
+  "system_memory_gb": 1007.5,
+  "pcie_generation": "PCIe 16x/16x",
+  "cpu_accelerator_bandwidth_gbs": null,
+  "network_interfaces": [
+    {
+      "name": "mlx5_0",
+      "type": "InfiniBand/RoCE",
+      "bandwidth_gbps": null
+    },
+    {
+      "name": "mlx5_1",
+      "type": "InfiniBand/RoCE",
+      "bandwidth_gbps": null
+    },
+    {
+      "name": "mlx5_bond_0",
+      "type": "InfiniBand/RoCE",
+      "bandwidth_gbps": null
+    }
+  ],
+  "os": "Ubuntu Jammy Jellyfish (development branch)",
+  "python_version": "3.10.8",
+  "kernel_version": "5.15.0-105-generic",
+  "runtime_version": "Moore Threads Driver 2.7.0",
+  "pytorch_version": "2.2.0"
+}
\ No newline at end of file
diff --git a/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/interactive/result.json b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/interactive/result.json
new file mode 100644
index 0000000..4f5ff81
--- /dev/null
+++ b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/interactive/result.json
@@ -0,0 +1,131 @@
+{
+  "schema_version": "1.0",
+  "suite_id": "suite_F",
+  "implementation_id": "moorethreads_vllm_musa_f2f6f965",
+  "chip": {
+    "name": "MTT S4000",
+    "vendor": "Moore Threads",
+    "count": 1,
+    "memory_gb": 48.0,
+    "interconnect_intra_node": null,
+    "interconnect_inter_node": null
+  },
+  "environment": {
+    "collected_at": "2026-05-18T08:40:55.208034+00:00",
+    "accelerators": [
+      {
+        "index": 0,
+        "name": "MTT S4000",
+        "vendor": "Moore Threads",
+        "memory_gb": 48.0,
+        "driver_version": "2.7.0",
+        "firmware_version": null,
+        "supports_bf16": true
+      }
+    ],
+    "accelerator_platform": "moorethreads",
+    "accelerator_topology": null,
+    "intra_node_interconnect": null,
+    "cpu": {
+      "model": "Intel(R) Xeon(R) Gold 6430",
+      "physical_cores": 64,
+      "logical_cores": 128,
+      "numa_nodes": 2
+    },
+    "system_memory_gb": 1007.5,
+    "pcie_generation": "PCIe 16x/16x",
+    "cpu_accelerator_bandwidth_gbs": null,
+    "network_interfaces": [
+      {
+        "name": "mlx5_0",
+        "type": "InfiniBand/RoCE",
+        "bandwidth_gbps": null
+      },
+      {
+        "name": "mlx5_1",
+        "type": "InfiniBand/RoCE",
+        "bandwidth_gbps": null
+      },
+      {
+        "name": "mlx5_bond_0",
+        "type": "InfiniBand/RoCE",
+        "bandwidth_gbps": null
+      }
+    ],
+    "os": "Ubuntu Jammy Jellyfish (development branch)",
+    "python_version": "3.10.8",
+    "kernel_version": "5.15.0-105-generic",
+    "runtime_version": "Moore Threads Driver 2.7.0",
+    "pytorch_version": "2.2.0"
+  },
+  "software": {
+    "framework": "vllm-musa",
+    "framework_version": "0.4.2",
+    "driver_version": "2.7.0",
+    "runtime_version": "Moore Threads Driver 2.7.0",
+    "os": "Ubuntu Jammy Jellyfish (development branch)",
+    "python_version": "3.10.8"
+  },
+  "model": {
+    "model_id": "Qwen/Qwen2.5-0.5B-Instruct",
+    "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775",
+    "model_name": null,
+    "model_note": null,
+    "model_source": "local",
+    "architecture": "dense",
+    "parameter_count_b": 0.5,
+    "precision": "BF16",
+    "effective_dtype": "float16",
+    "quantization_method": null,
+    "model_format": "HuggingFace original"
+  },
+  "task": {
+    "scenario": "interactive",
+    "num_runs": 3,
+    "warmup_runs": 1,
+    "parallelism": {
+      "tensor_parallel_size": 1,
+      "pipeline_parallel_size": 1,
+      "expert_parallel_size": 1,
+      "data_parallel_size": 1
+    },
+    "extra_config": null,
+    "runtime_metrics": null
+  },
+  "metrics": {
+    "interactive": {
+      "ttft_ms_p50": 25.89,
+      "ttft_ms_p90": 27.18,
+      "ttft_ms_p99": 28.51,
+      "tpot_ms_p50": 14.85,
+      "tpot_ms_p90": 15.17,
+      "tpot_ms_p99": 15.5,
+      "peak_memory_gb": null,
+      "elapsed_seconds_median": 481.4
+    }
+  },
+  "accuracy": {
+    "subset_score": null,
+    "baseline_delta": null,
+    "valid": false,
+    "notes": "Run --scenario accuracy to check model accuracy."
+  },
+  "meta": {
+    "submitted_by": "JuhaoLiang1997",
+    "submission_type": "individual",
+    "date": "2026-05-18",
+    "time": "17:21:09",
+    "run_id": "4f66d29d",
+    "run_name": "mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d",
+    "flagged": null,
+    "reproduce_script": "runners/moorethreads_vllm_musa_f2f6f965/runner.py",
+    "env_info_file": "../env_info.json",
+    "log_file": "run.log",
+    "samples_file": "samples.jsonl",
+    "notes": null,
+    "benchmark_start_time": "2026-05-18T08:56:46.686185+00:00",
+    "benchmark_end_time": "2026-05-18T09:21:09.800661+00:00",
+    "benchmark_elapsed_minutes": 24.4,
+    "model_load_seconds": 151.2
+  }
+}
\ No newline at end of file
diff --git a/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/offline/result.json b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/offline/result.json
new file mode 100644
index 0000000..2498167
--- /dev/null
+++ b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/offline/result.json
@@ -0,0 +1,164 @@
+{
+  "schema_version": "1.0",
+  "suite_id": "suite_F",
+  "implementation_id": "moorethreads_vllm_musa_f2f6f965",
+  "chip": {
+    "name": "MTT S4000",
+    "vendor": "Moore Threads",
+    "count": 1,
+    "memory_gb": 48.0,
+    "interconnect_intra_node": null,
+    "interconnect_inter_node": null
+  },
+  "environment": {
+    "collected_at": "2026-05-18T08:40:55.208034+00:00",
+    "accelerators": [
+      {
+        "index": 0,
+        "name": "MTT S4000",
+        "vendor": "Moore Threads",
+        "memory_gb": 48.0,
+        "driver_version": "2.7.0",
+        "firmware_version": null,
+        "supports_bf16": true
+      }
+    ],
+    "accelerator_platform": "moorethreads",
+    "accelerator_topology": null,
+    "intra_node_interconnect": null,
+    "cpu": {
+      "model": "Intel(R) Xeon(R) Gold 6430",
+      "physical_cores": 64,
+      "logical_cores": 128,
+      "numa_nodes": 2
+    },
+    "system_memory_gb": 1007.5,
+    "pcie_generation": "PCIe 16x/16x",
+    "cpu_accelerator_bandwidth_gbs": null,
+    "network_interfaces": [
+      {
+        "name": "mlx5_0",
+        "type": "InfiniBand/RoCE",
+        "bandwidth_gbps": null
+      },
+      {
+        "name": "mlx5_1",
+        "type": "InfiniBand/RoCE",
+        "bandwidth_gbps": null
+      },
+      {
+        "name": "mlx5_bond_0",
+        "type": "InfiniBand/RoCE",
+        "bandwidth_gbps": null
+      }
+    ],
+    "os": "Ubuntu Jammy Jellyfish (development branch)",
+    "python_version": "3.10.8",
+    "kernel_version": "5.15.0-105-generic",
+    "runtime_version": "Moore Threads Driver 2.7.0",
+    "pytorch_version": "2.2.0"
+  },
+  "software": {
+    "framework": "vllm-musa",
+    "framework_version": "0.4.2",
+    "driver_version": "2.7.0",
+    "runtime_version": "Moore Threads Driver 2.7.0",
+    "os": "Ubuntu Jammy Jellyfish (development branch)",
+    "python_version": "3.10.8"
+  },
+  "model": {
+    "model_id": "Qwen/Qwen2.5-0.5B-Instruct",
+    "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775",
+    "model_name": null,
+    "model_note": null,
+    "model_source": "local",
+    "architecture": "dense",
+    "parameter_count_b": 0.5,
+    "precision": "BF16",
+    "effective_dtype": "float16",
+    "quantization_method": null,
+    "model_format": "HuggingFace original"
+  },
+  "task": {
+    "scenario": "offline",
+    "num_runs": 3,
+    "warmup_runs": 1,
+    "parallelism": {
+      "tensor_parallel_size": 1,
+      "pipeline_parallel_size": 1,
+      "expert_parallel_size": 1,
+      "data_parallel_size": 1
+    },
+    "extra_config": null,
+    "runtime_metrics": null
+  },
+  "metrics": {
+    "offline": {
+      "results_by_concurrency": [
+        {
+          "client_concurrency": 4,
+          "throughput_tokens_per_sec": 1994.51,
+          "throughput_tokens_per_sec_per_chip": 1994.51,
+          "throughput_tokens_per_sec_total": 3642.41,
+          "elapsed_seconds_median": 12.5,
+          "peak_memory_gb": null,
+          "power_watts_avg": null,
+          "power_watts_peak": null,
+          "oom": false,
+          "_throughput_note": "output_only",
+          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+        },
+        {
+          "client_concurrency": 16,
+          "throughput_tokens_per_sec": 1998.44,
+          "throughput_tokens_per_sec_per_chip": 1998.44,
+          "throughput_tokens_per_sec_total": 3649.59,
+          "elapsed_seconds_median": 12.5,
+          "peak_memory_gb": null,
+          "power_watts_avg": null,
+          "power_watts_peak": null,
+          "oom": false,
+          "_throughput_note": "output_only",
+          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+        },
+        {
+          "client_concurrency": 64,
+          "throughput_tokens_per_sec": 2004.02,
+          "throughput_tokens_per_sec_per_chip": 2004.02,
+          "throughput_tokens_per_sec_total": 3659.77,
+          "elapsed_seconds_median": 12.5,
+          "peak_memory_gb": null,
+          "power_watts_avg": null,
+          "power_watts_peak": null,
+          "oom": false,
+          "_throughput_note": "output_only",
+          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+        }
+      ]
+    }
+  },
+  "accuracy": {
+    "subset_score": null,
+    "baseline_delta": null,
+    "valid": false,
+    "notes": "Run --scenario accuracy to check model accuracy."
+  },
+  "meta": {
+    "submitted_by": "JuhaoLiang1997",
+    "submission_type": "individual",
+    "date": "2026-05-18",
+    "time": "16:48:27",
+    "run_id": "4f66d29d",
+    "run_name": "mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d",
+    "flagged": null,
+    "reproduce_script": "runners/moorethreads_vllm_musa_f2f6f965/runner.py",
+    "env_info_file": "../env_info.json",
+    "log_file": "run.log",
+    "samples_file": "samples.jsonl",
+    "notes": null,
+    "benchmark_start_time": "2026-05-18T08:45:57.373367+00:00",
+    "benchmark_end_time": "2026-05-18T08:48:27.423209+00:00",
+    "benchmark_elapsed_minutes": 2.5,
+    "model_load_seconds": 146.8
+  }
+}
\ No newline at end of file
diff --git a/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/online/result.json b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/online/result.json
new file mode 100644
index 0000000..eb13372
--- /dev/null
+++ b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/online/result.json
@@ -0,0 +1,151 @@
+{
+  "schema_version": "1.0",
+  "suite_id": "suite_F",
+  "implementation_id": "moorethreads_vllm_musa_f2f6f965",
+  "chip": {
+    "name": "MTT S4000",
+    "vendor": "Moore Threads",
+    "count": 1,
+    "memory_gb": 48.0,
+    "interconnect_intra_node": null,
+    "interconnect_inter_node": null
+  },
+  "environment": {
+    "collected_at": "2026-05-18T08:40:55.208034+00:00",
+    "accelerators": [
+      {
+        "index": 0,
+        "name": "MTT S4000",
+        "vendor": "Moore Threads",
+        "memory_gb": 48.0,
+        "driver_version": "2.7.0",
+        "firmware_version": null,
+        "supports_bf16": true
+      }
+    ],
+    "accelerator_platform": "moorethreads",
+    "accelerator_topology": null,
+    "intra_node_interconnect": null,
+    "cpu": {
+      "model": "Intel(R) Xeon(R) Gold 6430",
+      "physical_cores": 64,
+      "logical_cores": 128,
+      "numa_nodes": 2
+    },
+    "system_memory_gb": 1007.5,
+    "pcie_generation": "PCIe 16x/16x",
+    "cpu_accelerator_bandwidth_gbs": null,
+    "network_interfaces": [
+      {
+        "name": "mlx5_0",
+        "type": "InfiniBand/RoCE",
+        "bandwidth_gbps": null
+      },
+      {
+        "name": "mlx5_1",
+        "type": "InfiniBand/RoCE",
+        "bandwidth_gbps": null
+      },
+      {
+        "name": "mlx5_bond_0",
+        "type": "InfiniBand/RoCE",
+        "bandwidth_gbps": null
+      }
+    ],
+    "os": "Ubuntu Jammy Jellyfish (development branch)",
+    "python_version": "3.10.8",
+    "kernel_version": "5.15.0-105-generic",
+    "runtime_version": "Moore Threads Driver 2.7.0",
+    "pytorch_version": "2.2.0"
+  },
+  "software": {
+    "framework": "vllm-musa",
+    "framework_version": "0.4.2",
+    "driver_version": "2.7.0",
+    "runtime_version": "Moore Threads Driver 2.7.0",
+    "os": "Ubuntu Jammy Jellyfish (development branch)",
+    "python_version": "3.10.8"
+  },
+  "model": {
+    "model_id": "Qwen/Qwen2.5-0.5B-Instruct",
+    "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775",
+    "model_name": null,
+    "model_note": null,
+    "model_source": "local",
+    "architecture": "dense",
+    "parameter_count_b": 0.5,
+    "precision": "BF16",
+    "effective_dtype": "float16",
+    "quantization_method": null,
+    "model_format": "HuggingFace original"
+  },
+  "task": {
+    "scenario": "online",
+    "num_runs": 3,
+    "warmup_runs": 1,
+    "parallelism": {
+      "tensor_parallel_size": 1,
+      "pipeline_parallel_size": 1,
+      "expert_parallel_size": 1,
+      "data_parallel_size": 1
+    },
+    "extra_config": null,
+    "runtime_metrics": null
+  },
+  "metrics": {
+    "online": {
+      "sla_ttft_ms": 500,
+      "max_valid_qps": 40,
+      "results_by_qps": [
+        {
+          "target_qps": 10,
+          "achieved_qps": 10.0,
+          "ttft_ms_p50": 47.68,
+          "ttft_ms_p90": 96.31,
+          "ttft_ms_p99": 956.22,
+          "tpot_ms_p50": 47.25,
+          "tpot_ms_p90": 80.82,
+          "tpot_ms_p99": 131.63,
+          "elapsed_seconds_median": 37.8,
+          "sla_met": false
+        },
+        {
+          "target_qps": 40,
+          "achieved_qps": 40.0,
+          "ttft_ms_p50": 94.5,
+          "ttft_ms_p90": 194.64,
+          "ttft_ms_p99": 331.88,
+          "tpot_ms_p50": 74.76,
+          "tpot_ms_p90": 287.01,
+          "tpot_ms_p99": 444.19,
+          "elapsed_seconds_median": 19.0,
+          "sla_met": true
+        }
+      ]
+    }
+  },
+  "accuracy": {
+    "subset_score": null,
+    "baseline_delta": null,
+    "valid": false,
+    "notes": "Run --scenario accuracy to check model accuracy."
+  },
+  "meta": {
+    "submitted_by": "JuhaoLiang1997",
+    "submission_type": "individual",
+    "date": "2026-05-18",
+    "time": "16:53:54",
+    "run_id": "4f66d29d",
+    "run_name": "mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d",
+    "flagged": null,
+    "reproduce_script": "runners/moorethreads_vllm_musa_f2f6f965/runner.py",
+    "env_info_file": "../env_info.json",
+    "log_file": "run.log",
+    "samples_file": "samples.jsonl",
+    "notes": null,
+    "benchmark_start_time": "2026-05-18T08:51:01.188901+00:00",
+    "benchmark_end_time": "2026-05-18T08:53:54.250762+00:00",
+    "benchmark_elapsed_minutes": 2.9,
+    "model_load_seconds": 132.6
+  }
+}
\ No newline at end of file
diff --git a/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/result.json b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/result.json
new file mode 100644
index 0000000..a1c073d
--- /dev/null
+++ b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/result.json
@@ -0,0 +1,215 @@
+{
+  "schema_version": "1.0",
+  "suite_id": "suite_F",
+  "implementation_id": "moorethreads_vllm_musa_f2f6f965",
+  "chip": {
+    "name": "MTT S4000",
+    "vendor": "Moore Threads",
+    "count": 1,
+    "memory_gb": 48.0,
+    "interconnect_intra_node": null,
+    "interconnect_inter_node": null
+  },
+  "environment": {
+    "collected_at": "2026-05-18T08:40:55.208034+00:00",
+    "accelerators": [
+      {
+        "index": 0,
+        "name": "MTT S4000",
+        "vendor": "Moore Threads",
+        "memory_gb": 48.0,
+        "driver_version": "2.7.0",
+        "firmware_version": null,
+        "supports_bf16": true
+      }
+    ],
+    "accelerator_platform": "moorethreads",
+    "accelerator_topology": null,
+    "intra_node_interconnect": null,
+    "cpu": {
+      "model": "Intel(R) Xeon(R) Gold 6430",
+      "physical_cores": 64,
+      "logical_cores": 128,
+      "numa_nodes": 2
+    },
+    "system_memory_gb": 1007.5,
+    "pcie_generation": "PCIe 16x/16x",
+    "cpu_accelerator_bandwidth_gbs": null,
+    "network_interfaces": [
+      {
+        "name": "mlx5_0",
+        "type": "InfiniBand/RoCE",
+        "bandwidth_gbps": null
+      },
+      {
+        "name": "mlx5_1",
+        "type": "InfiniBand/RoCE",
+        "bandwidth_gbps": null
+      },
+      {
+        "name": "mlx5_bond_0",
+        "type": "InfiniBand/RoCE",
+        "bandwidth_gbps": null
+      }
+    ],
+    "os": "Ubuntu Jammy Jellyfish (development branch)",
+    "python_version": "3.10.8",
+    "kernel_version": "5.15.0-105-generic",
+    "runtime_version": "Moore Threads Driver 2.7.0",
+    "pytorch_version": "2.2.0"
+  },
+  "software": {
+    "framework": "vllm-musa",
+    "framework_version": "0.4.2",
+    "driver_version": "2.7.0",
+    "runtime_version": "Moore Threads Driver 2.7.0",
+    "os": "Ubuntu Jammy Jellyfish (development branch)",
+    "python_version": "3.10.8"
+  },
+  "model": {
+    "model_id": "Qwen/Qwen2.5-0.5B-Instruct",
+    "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775",
+    "model_name": null,
+    "model_note": null,
+    "model_source": "local",
+    "architecture": "dense",
+    "parameter_count_b": 0.5,
+    "precision": "BF16",
+    "effective_dtype": "float16",
+    "quantization_method": null,
+    "model_format": "HuggingFace original"
+  },
+  "task": {
+    "scenarios_run": [
+      "offline",
+      "online",
+      "interactive"
+    ],
+    "parallelism": {
+      "tensor_parallel_size": 1,
+      "pipeline_parallel_size": 1,
+      "expert_parallel_size": 1,
+      "data_parallel_size": 1
+    },
+    "num_runs": 3,
+    "extra_config": null
+  },
+  "metrics": {
+    "derived": {},
+    "offline": {
+      "results_by_concurrency": [
+        {
+          "client_concurrency": 4,
+          "throughput_tokens_per_sec": 1994.51,
+          "throughput_tokens_per_sec_per_chip": 1994.51,
+          "throughput_tokens_per_sec_total": 3642.41,
+          "elapsed_seconds_median": 12.5,
+          "peak_memory_gb": null,
+          "power_watts_avg": null,
+          "power_watts_peak": null,
+          "oom": false,
+          "_throughput_note": "output_only",
+          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+        },
+        {
+          "client_concurrency": 16,
+          "throughput_tokens_per_sec": 1998.44,
+          "throughput_tokens_per_sec_per_chip": 1998.44,
+          "throughput_tokens_per_sec_total": 3649.59,
+          "elapsed_seconds_median": 12.5,
+          "peak_memory_gb": null,
+          "power_watts_avg": null,
+          "power_watts_peak": null,
+          "oom": false,
+          "_throughput_note": "output_only",
+          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+        },
+        {
+          "client_concurrency": 64,
+          "throughput_tokens_per_sec": 2004.02,
+          "throughput_tokens_per_sec_per_chip": 2004.02,
+          "throughput_tokens_per_sec_total": 3659.77,
+          "elapsed_seconds_median": 12.5,
+          "peak_memory_gb": null,
+          "power_watts_avg": null,
+          "power_watts_peak": null,
+          "oom": false,
+          "_throughput_note": "output_only",
+          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+        }
+      ]
+    },
+    "online": {
+      "sla_ttft_ms": 500,
+      "max_valid_qps": 40,
+      "results_by_qps": [
+        {
+          "target_qps": 10,
+          "achieved_qps": 10.0,
+          "ttft_ms_p50": 47.68,
+          "ttft_ms_p90": 96.31,
+          "ttft_ms_p99": 956.22,
+          "tpot_ms_p50": 47.25,
+          "tpot_ms_p90": 80.82,
+          "tpot_ms_p99": 131.63,
+          "elapsed_seconds_median": 37.8,
+          "sla_met": false
+        },
+        {
+          "target_qps": 40,
+          "achieved_qps": 40.0,
+          "ttft_ms_p50": 94.5,
+          "ttft_ms_p90": 194.64,
+          "ttft_ms_p99": 331.88,
+          "tpot_ms_p50": 74.76,
+          "tpot_ms_p90": 287.01,
+          "tpot_ms_p99": 444.19,
+          "elapsed_seconds_median": 19.0,
+          "sla_met": true
+        }
+      ]
+    },
+    "interactive": {
+      "ttft_ms_p50": 25.89,
+      "ttft_ms_p90": 27.18,
+      "ttft_ms_p99": 28.51,
+      "tpot_ms_p50": 14.85,
+      "tpot_ms_p90": 15.17,
+      "tpot_ms_p99": 15.5,
+      "peak_memory_gb": null,
+      "elapsed_seconds_median": 481.4
+    }
+  },
+  "accuracy": {
+    "subset_score": 0.07,
+    "baseline_delta": -0.31,
+    "valid": false,
+    "framework": "vllm-musa",
+    "precision": "BF16",
+    "notes": "Integrated accuracy check \u2014 used same vllm-musa instance as benchmark."
+  },
+  "meta": {
+    "submitted_by": "JuhaoLiang1997",
+    "submission_type": "individual",
+    "date": "2026-05-18",
+    "time": "16:48:27",
+    "run_id": "4f66d29d",
+    "run_name": "mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d",
+    "flagged": null,
+    "reproduce_script": "runners/moorethreads_vllm_musa_f2f6f965/runner.py",
+    "env_info_file": "../env_info.json",
+    "log_file": "run.log",
+    "samples_file": "samples.jsonl",
+    "notes": "Partial run: ['offline', 'online', 'interactive'] succeeded, ['accuracy'] failed.",
+    "benchmark_start_time": "2026-05-18T08:45:57.373367+00:00",
+    "benchmark_end_time": "2026-05-18T08:48:27.423209+00:00",
+    "benchmark_elapsed_minutes": 29.8,
+    "model_load_seconds": 146.8,
+    "benchmark_elapsed_minutes_note": "Total across ['offline', 'online', 'interactive'] scenarios.",
+    "scenario_dirs": {
+      "offline": "results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/offline",
+      "online": "results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/online",
+      "interactive": "results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/interactive"
+    }
+  }
+}
\ No newline at end of file

From d2e78a253e8686e5beb62c9228ea2ec8058f1745 Mon Sep 17 00:00:00 2001
From: Liang Juhao <juhaoliang1997@gmail.com>
Date: Mon, 18 May 2026 18:19:24 +0800
Subject: [PATCH 5/5] docs: mark Moore Threads Suites A and F as validated on MTT S4000

---
 README.md                                         | 2 +-
 runners/moorethreads_vllm_musa_f2f6f965/README.md | 4 ++--
 runners/moorethreads_vllm_musa_f2f6f965/meta.json | 6 +++---
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 2ca3d64..3007966 100644
--- a/README.md
+++ b/README.md
@@ -93,7 +93,7 @@ Reference runners live under `runners/` (see each folder’s `meta.json`). The t
 | Huawei Ascend NPU | `ascend_vllm_ascend_d4aa9fda` | vllm-ascend | ✓ | ✓ | ✓ | ✓ | ✓ | — | — |
 | Apple Silicon | `apple_mlx_lm_9546b8b5` | mlx-lm | ⋯ | — | — | ⋯ | — | ⋯ | — |
 | Google TPU | `google_vllm_tpu_68cc9ffa` | vllm-tpu | ✓ | — | — | ✓ | — | ✓ | — |
-| Moore Threads GPU | `moorethreads_vllm_musa_f2f6f965` | vllm-musa | ⋯ | ⋯ | ⋯ | ⋯ | ⋯ | ⋯ | — |
+| Moore Threads GPU | `moorethreads_vllm_musa_f2f6f965` | vllm-musa | ✓ | ⋯ | ⋯ | ⋯ | ⋯ | ✓ | — |
 
 _Legend: ✓ validated · ⋯ author-declared (not smoke-tested in this repo yet) · — unsupported._
 <!-- platforms-matrix:end -->
diff --git a/runners/moorethreads_vllm_musa_f2f6f965/README.md b/runners/moorethreads_vllm_musa_f2f6f965/README.md
index e963d18..5111bdc 100644
--- a/runners/moorethreads_vllm_musa_f2f6f965/README.md
+++ b/runners/moorethreads_vllm_musa_f2f6f965/README.md
@@ -7,12 +7,12 @@ AccelMark runner for Moore Threads MUSA GPUs using
 
 | Suite | Description | Notes |
 |-------|-------------|-------|
-| Suite A | Single-chip, Llama-3-8B | Smoke tested on MTT S4000; accuracy not at baseline on vLLM 0.4.x |
+| Suite A | Single-chip, Llama-3-8B | Validated on MTT S4000 (default scenarios: accuracy/offline/online; offline and online pass, accuracy not at baseline on vLLM 0.4.x) |
 | Suite B | Multi-chip, Llama-3-70B | MCCL tensor parallelism; set `VLLM_WORKER_MULTIPROC_METHOD=spawn` |
 | Suite C | Quantization, Llama-3.1-8B | FP8 skipped (not supported); W8A8/W8A16 via compressed-tensors |
 | Suite D | Long context ~28K input, Llama-3.1-8B | Reduce `max_num_seqs` / `gpu_memory_utilization` in runner config |
 | Suite E | Multi-chip scaling, Llama-3-8B | MCCL tensor parallelism |
-| Suite F | Edge, Qwen2.5-0.5B | Smoke tested on MTT S4000; recommended first run |
+| Suite F | Edge, Qwen2.5-0.5B | Validated on MTT S4000 (community result in repo) |
 | Suite G | MoE multi-chip, Mixtral-8x7B | Unsupported |
 
 ## Hardware compatibility
diff --git a/runners/moorethreads_vllm_musa_f2f6f965/meta.json b/runners/moorethreads_vllm_musa_f2f6f965/meta.json
index 9d5728d..e57d72d 100644
--- a/runners/moorethreads_vllm_musa_f2f6f965/meta.json
+++ b/runners/moorethreads_vllm_musa_f2f6f965/meta.json
@@ -6,16 +6,16 @@
   "submitted_by": "JuhaoLiang1997",
   "description": "AccelMark runner for Moore Threads MUSA GPUs using vllm-musa (https://github.com/MooreThreads/vllm-musa). Install torch/vllm per upstream README_vllm_musa.md; requirements.txt adds benchmark deps only. Sets device=musa; BF16 maps to float16 on vLLM <0.10. MCCL tensor parallelism. FP8 unsupported.",
   "supersedes_chain": [],
-  "notes": "MMLU not at baseline on tested vLLM 0.4.x+musa stack — see runner README.",
+  "notes": "Smoke-tested on MTT S4000 (vLLM 0.4.2+musa): Suite A and F default scenarios run. MMLU not at baseline — see runner README.",
   "created": "2026-05-18",
   "hardware_label": null,
   "suite_support": {
-    "A": "pending",
+    "A": "validated",
     "B": "pending",
     "C": "pending",
     "D": "pending",
     "E": "pending",
-    "F": "pending",
+    "F": "validated",
     "G": "unsupported"
   }
 }