From 99e4a06be3d13bd13bd340c2e43b24fcd7169406 Mon Sep 17 00:00:00 2001 From: Liang Juhao Date: Fri, 15 May 2026 11:08:00 +0800 Subject: [PATCH 1/5] =?UTF-8?q?feat:=20add=20Moore=20Threads=20MUSA=20runn?= =?UTF-8?q?er=20(S5000/S4000)=20=E2=80=94=20moorethreads=5Fvllm=5Fmusa=5F5?= =?UTF-8?q?7ff5443?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the AccelMark runner skeleton for Moore Threads MTT S5000 / S4000 GPUs via the official vllm-musa platform plugin. The plugin auto-patches vLLM at import time (torchada CUDA→MUSA aliasing + pymtml + Triton patches), so the standard vLLM Python API is preserved and the runner mirrors the structure of ascend_vllm_ascend. What is included: * runners/moorethreads_vllm_musa_57ff5443/ — runner.py, meta.json (with suite_support self-declaration), requirements.txt, README.md * configs/runner_configs/runner_moorethreads_vllm_musa_57ff5443.yaml.example The README platforms matrix updates automatically from the runner's meta.json (no hand-editing required, thanks to the onboarding decoupling that landed in the preceding commit). The Moore Threads environment detector also already lives at runners/platforms/moorethreads.py in the same earlier commit. Notes: * Capability flags are conservative: SUPPORTED_QUANTIZATION_BACKENDS only declares compressed-tensors; FP8 / AWQ / GPTQ-Marlin will be enabled in a follow-up runner version once real-hardware smoke tests confirm kernel coverage on MUSA. * This code has not yet been validated on physical S5000 / S4000 silicon; all suites are marked "pending" in suite_support and smoke testing will land as a new runner folder with a fresh hash. 
Co-authored-by: Cursor --- README.md | 1 + ...orethreads_vllm_musa_57ff5443.yaml.example | 62 ++ .../moorethreads_vllm_musa_57ff5443/README.md | 200 ++++++ .../moorethreads_vllm_musa_57ff5443/meta.json | 21 + .../requirements.txt | 58 ++ .../moorethreads_vllm_musa_57ff5443/runner.py | 575 ++++++++++++++++++ 6 files changed, 917 insertions(+) create mode 100644 configs/runner_configs/runner_moorethreads_vllm_musa_57ff5443.yaml.example create mode 100644 runners/moorethreads_vllm_musa_57ff5443/README.md create mode 100644 runners/moorethreads_vllm_musa_57ff5443/meta.json create mode 100644 runners/moorethreads_vllm_musa_57ff5443/requirements.txt create mode 100644 runners/moorethreads_vllm_musa_57ff5443/runner.py diff --git a/README.md b/README.md index ea9e2b6..92cec27 100644 --- a/README.md +++ b/README.md @@ -93,6 +93,7 @@ Reference runners live under `runners/` (see each folder’s `meta.json`). The t | Huawei Ascend NPU | `ascend_vllm_ascend_d4aa9fda` | vllm-ascend | ✓ | ✓ | ✓ | ✓ | ✓ | — | — | | Apple Silicon | `apple_mlx_lm_9546b8b5` | mlx-lm | ⋯ | — | — | ⋯ | — | ⋯ | — | | Google TPU | `google_vllm_tpu_68cc9ffa` | vllm-tpu | ✓ | — | — | ✓ | — | ✓ | — | +| Moore Threads GPU | `moorethreads_vllm_musa_57ff5443` | vllm-musa | ⋯ | ⋯ | ⋯ | ⋯ | ⋯ | ⋯ | — | _Legend: ✓ validated · ⋯ author-declared (not smoke-tested in this repo yet) · — unsupported._ diff --git a/configs/runner_configs/runner_moorethreads_vllm_musa_57ff5443.yaml.example b/configs/runner_configs/runner_moorethreads_vllm_musa_57ff5443.yaml.example new file mode 100644 index 0000000..5c8f878 --- /dev/null +++ b/configs/runner_configs/runner_moorethreads_vllm_musa_57ff5443.yaml.example @@ -0,0 +1,62 @@ +# AccelMark runner config — moorethreads_vllm_musa_57ff5443 (vllm-musa on Moore Threads) +# +# Copy this file to runner_moorethreads_vllm_musa_57ff5443.yaml (remove +# .example suffix) and edit as needed for your hardware. The actual .yaml +# is gitignored. 
+# +# These settings adapt the runner to your hardware environment. They are +# recorded in result.json task.extra_config for transparency but are NOT +# part of the benchmark identity (not hashed into run_id). +# +# Merge priority: CLI flags > suite-specific > global defaults > runner defaults + +# ── Global defaults (apply to all suites) ───────────────────────────────────── + +# Tensor parallel size — number of Moore Threads GPUs to use (default: 1). +# For multi-card runs make sure to export VLLM_WORKER_MULTIPROC_METHOD=spawn. +tensor_parallel_size: 1 + +# Disable Triton CUDA-graph / compilation. Set true if you hit Triton kernel +# errors on first request (most common on S3000 / S80 paths). +enforce_eager: false + +# Maximum number of sequences in a batch (default: 256). +# Reduce on lower-memory cards: 128 on 24 GB cards, 64 on 16 GB cards. +max_num_seqs: 256 + +# Fraction of MUSA HBM that vLLM may use in total — weights, activations and +# KV cache (default: 0.85). Reduce if you hit OOM; the vLLM flag is named +# gpu_memory_utilization but applies to MUSA HBM via torchada. +gpu_memory_utilization: 0.85 + +# Pass-through kwargs forwarded directly to vLLM LLM() / AsyncEngineArgs(). +# Unknown keys are dropped automatically with a warning, so this is safe to +# use across vLLM 0.10.x / 0.13.x. +# engine_kwargs: +# swap_space: 8 +# max_seq_len_to_capture: 4096 + +# ── Suite-specific overrides ─────────────────────────────────────────────────── + +suites: + suite_D: + # Long-context — reduce batch size and lower the memory fraction for headroom. + max_num_seqs: 32 + gpu_memory_utilization: 0.80 + + suite_F: + # Consumer / edge GPU — enforce_eager often safer for first runs. + # enforce_eager: true + max_num_seqs: 128 + +# ── Speculative decoding (suite_A / suite_D extra scenario) ───────────────── +# Uncomment to enable, merging the entries into the `suites:` block above +# (a second top-level `suites:` key would be a duplicate). vllm-musa accepts +# the same speculative_config dict as upstream vLLM; the runner translates flat +# keys (speculative_model, num_speculative_tokens, ...) into it automatically. 
+# +# suites: +# suite_A: +# engine_kwargs: +# speculative_model: "meta-llama/Llama-3.2-1B-Instruct" +# num_speculative_tokens: 4 +# speculative_draft_tensor_parallel_size: 1 diff --git a/runners/moorethreads_vllm_musa_57ff5443/README.md b/runners/moorethreads_vllm_musa_57ff5443/README.md new file mode 100644 index 0000000..82411a1 --- /dev/null +++ b/runners/moorethreads_vllm_musa_57ff5443/README.md @@ -0,0 +1,200 @@ +# moorethreads_vllm_musa_57ff5443 — Moore Threads MUSA Runner (vllm-musa) + +AccelMark runner for Moore Threads MUSA GPUs using +[vllm-musa](https://github.com/MooreThreads/vllm-musa), the official vLLM +platform plugin for MUSA hardware. + +> **Status:** This runner is **untested on real silicon at the time of +> commit**. The code is written against the public `vllm-musa` plugin +> documentation and follows the structural template of the +> `ascend_vllm_ascend_*` runner. Plan to smoke-test on an S5000 / S4000 +> system; capability flags and dtype mappings may be adjusted in a follow-up +> runner version (new hash, new folder) based on real-world findings. + +## How vllm-musa works + +`vllm-musa` is a vLLM **platform plugin** (auto-detected on `import vllm`) +that makes the standard vLLM Python API run on Moore Threads MUSA GPUs. It +relies on three components: + +| Component | Role | +|---|---| +| `torchada` | CUDA→MUSA compatibility layer for PyTorch — aliases `torch.cuda.*` to MUSA so most code paths run unmodified | +| `pymtml` (`mthreads-ml-py`) | Moore Threads Management Library bindings, equivalent to `nvidia-ml-py` | +| Triton patches | Runtime monkey-patches in `vllm_musa_platform.patches.*` that fix `triton.attention` and `worker` modules for MUSA's Triton compiler | + +The standard `vllm.LLM`, `vllm.AsyncLLMEngine`, and `vllm.SamplingParams` +remain the entry points — this runner therefore reuses ~95% of the logic +from the NVIDIA / Ascend vLLM runners. 
+ +## Supported suites + +| Suite | Description | Notes | +|-------|-------------|-------| +| Suite A | Single-chip, Llama-3-8B | Pending smoke test on S4000 / S5000 | +| Suite B | Multi-chip, Llama-3-70B | Requires multiple Moore Threads cards + MCCL TP | +| Suite C | Quantization, Llama-3.1-8B | FP8 skipped (no native FP8 in current MUSA hardware); compressed-tensors W8A8/W8A16 candidate; AWQ / GPTQ pending validation | +| Suite D | Long context ~28K input, Llama-3.1-8B | Reduce `max_num_seqs` and `gpu_memory_utilization` | +| Suite E | Multi-chip scaling, Llama-3-8B | Validates MCCL tensor parallelism | +| Suite F | Consumer/edge, Qwen2.5-0.5B | Recommended starting point for S4000 single-card systems | + +## Hardware compatibility + +| GPU | BF16 | TP via MCCL | FP8 | Notes | +|-----|------|-------------|-----|-------| +| MTT S5000 | ✅ | ✅ | ❌ | Recommended public reference target (FA3 via MATE) | +| MTT S4000 | ✅ | ✅ | ❌ | Validated path with PyTorch SDPA-based FlashAttention | +| MTT S3000 | ⚠️ | ⚠️ | ❌ | May work via `--enforce-eager`; not the public reference | +| MTT S80 | ⚠️ | — | ❌ | Consumer card; treat as best-effort | + +## Prerequisites + +You must install the MUSA stack in this exact order — Python packages alone +are not sufficient: + +**1. MUSA toolkit + driver** + +Match the toolkit version to your card firmware. Reference: https://developer.mthreads.com/musa/ + +**2. PyTorch with MUSA support (torch + torchada)** + +The recommended path is the official Moore Threads container, which ships a +pre-built `torch==2.7.1` together with `torchada` and `pymtml`. Pull it with: + +```bash +docker pull sh-harbor.mthreads.com/mcctest/musa-compile:rc4.3.3-torch2.7-20251120 +``` + +**3. Runner dependencies** + +Inside the MUSA container: + +```bash +pip install -r runners/moorethreads_vllm_musa_57ff5443/requirements.txt +``` + +This installs `vllm-musa==0.1.1` which auto-pulls a validated vLLM core +(`0.10.1.1` by default). 
To use vLLM `0.13.0` instead (V1-only engine): + +```bash +pip install vllm==0.13.0 --no-deps --upgrade +pip install 'depyf==0.20.0' 'llguidance>=1.3.0,<1.4.0' \ + 'lm-format-enforcer==0.11.3' 'outlines_core==0.2.11' \ + 'xgrammar==0.1.27' 'compressed-tensors==0.12.2' +``` + +## Required environment variables + +```bash +# Device visibility (works like CUDA_VISIBLE_DEVICES) +export MUSA_VISIBLE_DEVICES=0,1,2,3 + +# Recommended for multi-process workers (TP > 1) +export VLLM_WORKER_MULTIPROC_METHOD=spawn +``` + +## Basic usage + +```bash +# Verify the plugin is loaded before running anything else +python -c "from vllm_musa_platform import musa_platform_plugin; print('ok')" + +# Suite F (single-card S4000 / S5000) +python run.py --runner moorethreads_vllm_musa_57ff5443 --suite suite_F + +# Suite A (single-card datacenter benchmark) +python run.py --runner moorethreads_vllm_musa_57ff5443 --suite suite_A + +# Multi-card tensor parallelism (e.g. 8 x S5000 on a single host) +VLLM_WORKER_MULTIPROC_METHOD=spawn \ +python run.py --runner moorethreads_vllm_musa_57ff5443 \ + --suite suite_B \ + --tensor-parallel-size 8 + +# Local model cache +python run.py --runner moorethreads_vllm_musa_57ff5443 \ + --suite suite_A \ + --model-path /data/models/Meta-Llama-3-8B-Instruct +``` + +## Runner config + +Copy the example config and adjust for your hardware: + +```bash +cp configs/runner_configs/runner_moorethreads_vllm_musa_57ff5443.yaml.example \ + configs/runner_configs/runner_moorethreads_vllm_musa_57ff5443.yaml +``` + +Key settings: + +| Field | Default | Notes | +|-------|---------|-------| +| `tensor_parallel_size` | 1 | Number of MUSA GPUs for tensor parallelism | +| `enforce_eager` | false | Disable CUDA-graph / compilation; useful for pre-S4000 cards or while debugging | +| `max_num_seqs` | 256 | Max concurrent sequences; reduce on lower-memory cards | +| `gpu_memory_utilization` | 0.85 | Fraction of HBM vLLM may use in total (weights, activations, KV cache); reduce if OOM | + +## Triton / kernel 
compilation errors + +If you encounter errors during Triton graph capture on first request, +disable graph capture with `--enforce-eager`: + +```bash +python run.py --runner moorethreads_vllm_musa_57ff5443 \ + --suite suite_F --enforce-eager +``` + +Or set persistently in the runner config YAML: + +```yaml +enforce_eager: true +``` + +## HBM OOM errors + +Reduce `gpu_memory_utilization` and/or `max_num_seqs`, either globally or +per-suite (Suite D is the most memory-hungry due to long-context inputs): + +```yaml +gpu_memory_utilization: 0.80 +max_num_seqs: 128 + +suites: + suite_D: + max_num_seqs: 32 + gpu_memory_utilization: 0.78 +``` + +## Known gaps (pre-smoke-test) + +The following items are placeholders and **must be re-validated** on real +S4000 / S5000 hardware: + +- **Memory peak**: relies on `torch.cuda.max_memory_allocated()` which + torchada aliases to MUSA. If this returns 0 or `None`, fall back to + `pymtml.mtmlDeviceGetMemoryInfo()`. +- **MCCL teardown**: assumes the same `cleanup_dist_env_and_memory` entry + point as upstream vLLM. If MCCL leaves a hanging process group, the + fallback path explicitly destroys the torch.distributed group. +- **Quantization**: `SUPPORTED_QUANTIZATION_BACKENDS` currently lists only + `compressed-tensors`. AWQ / GPTQ-Marlin / FP8 are intentionally excluded + until kernel coverage on MUSA is confirmed. +- **Chip-count detection**: `_get_chip_count()` prefers `pymtml` over + `torch.cuda.device_count()`. pymtml queries the driver directly, so its count + may not honour `MUSA_VISIBLE_DEVICES`; if pymtml is missing, the torch fallback + should still work because torchada provides `torch.cuda.device_count()`. + +## Requirements + +See `requirements.txt` for the pinned plugin / extras list. The heavy +dependencies (torch + torchada + MUSA toolkit) must come from the Moore +Threads container; do not install them from PyPI. 
+ +Minimum environment: +- Moore Threads MTT S4000 or newer (S3000 / S80 best-effort) +- MUSA toolkit + driver matching card firmware +- torch 2.7.1 (Moore Threads MUSA build) + torchada ≥ 0.1.9 +- Python 3.10+ +- vllm-musa 0.1.1 (vLLM core 0.10.1.1 or 0.13.0) diff --git a/runners/moorethreads_vllm_musa_57ff5443/meta.json b/runners/moorethreads_vllm_musa_57ff5443/meta.json new file mode 100644 index 0000000..655a6ef --- /dev/null +++ b/runners/moorethreads_vllm_musa_57ff5443/meta.json @@ -0,0 +1,21 @@ +{ + "id": "moorethreads_vllm_musa_57ff5443", + "platform": "moorethreads", + "name": "vllm-musa on Moore Threads MUSA GPU", + "framework": "vllm-musa", + "submitted_by": "JuhaoLiang1997", + "description": "AccelMark runner for Moore Threads MTT S4000 / S5000 MUSA GPUs via the vllm-musa platform plugin (vLLM 0.10.x / 0.13.x + torchada CUDA→MUSA compatibility + pymtml). API-compatible with standard vLLM; MCCL-based tensor parallelism. FP8 excluded — not supported on current MUSA hardware. Quantization limited to compressed-tensors (W8A8/W8A16) pending real-hardware validation of AWQ / GPTQ / FP8 paths.", + "supersedes_chain": [], + "notes": "Initial Moore Threads runner. 
Written from the public vllm-musa documentation and the structural template of ascend_vllm_ascend_d4aa9fda; capability flags, dtype mapping and teardown sequence are placeholders awaiting smoke-testing on real S4000 / S5000 silicon.", + "created": "2026-05-15", + "hardware_label": null, + "suite_support": { + "A": "pending", + "B": "pending", + "C": "pending", + "D": "pending", + "E": "pending", + "F": "pending", + "G": "unsupported" + } +} diff --git a/runners/moorethreads_vllm_musa_57ff5443/requirements.txt b/runners/moorethreads_vllm_musa_57ff5443/requirements.txt new file mode 100644 index 0000000..2a44733 --- /dev/null +++ b/runners/moorethreads_vllm_musa_57ff5443/requirements.txt @@ -0,0 +1,58 @@ +# AccelMark -- Moore Threads MUSA vllm-musa runner dependencies +# +# This runner is designed to run inside the official Moore Threads MUSA +# container (which already ships torch + torchada built for the MUSA +# toolkit) and only installs the vLLM platform plugin + accelmark extras +# on top of it. +# +# Tested image (subject to change at smoke-test time): +# sh-harbor.mthreads.com/mcctest/musa-compile:rc4.3.3-torch2.7-20251120 +# Reference docker command: +# docker run -d --net host --privileged --pid=host --shm-size 500g \ +# -v $PWD:/ws -w /ws \ +# --name accelmark-musa \ +# sh-harbor.mthreads.com/mcctest/musa-compile:rc4.3.3-torch2.7-20251120 \ +# sleep infinity +# docker exec -it accelmark-musa bash +# +# Pre-installed in the container (do NOT reinstall via pip): +# torch==2.7.1 (built for MUSA with torchada) +# torchada>=0.1.9 (CUDA→MUSA compatibility layer) +# mthreads-ml-py>=2.2.5 (pymtml — MTML bindings) +# +# vLLM core: the plugin pulls in a compatible version automatically, but for +# reproducibility we pin to one of the validated combinations below. +# Pick ONE of these two stacks (the matching commands are in the install +# guide in README.md): +# +# stack A — vLLM 0.10.1.1 (V0 + V1 engines): +# pip install -e . 
# plugin auto-installs vllm==0.10.1.1 +# +# stack B — vLLM 0.13.0 (V1-only): +# pip install -e . # plugin installs vllm==0.10.1.1 +# pip install vllm==0.13.0 --no-deps --upgrade +# pip install 'depyf==0.20.0' 'llguidance>=1.3.0,<1.4.0' \ +# 'lm-format-enforcer==0.11.3' 'outlines_core==0.2.11' \ +# 'xgrammar==0.1.27' 'compressed-tensors==0.12.2' + +# vLLM MUSA platform plugin (PyPI: vllm-musa, GitHub: MooreThreads/vllm-musa) +vllm-musa==0.1.1 + +# Transformers stack — pin to versions compatible with vLLM 0.10.x / 0.13.x +transformers==4.46.3 +tokenizers==0.20.3 +huggingface-hub==0.26.5 +accelerate==1.2.1 +safetensors==0.4.5 + +# AccelMark dependencies (not bundled in the image) +numpy==1.26.4 +jsonschema==4.25.1 +psutil==7.1.0 +tqdm==4.67.1 + +# Async support +aiohttp==3.12.15 + +# Config file parsing +PyYAML==6.0.2 diff --git a/runners/moorethreads_vllm_musa_57ff5443/runner.py b/runners/moorethreads_vllm_musa_57ff5443/runner.py new file mode 100644 index 0000000..d753330 --- /dev/null +++ b/runners/moorethreads_vllm_musa_57ff5443/runner.py @@ -0,0 +1,575 @@ +""" +AccelMark — Moore Threads MUSA GPU benchmark runner (vllm-musa). + +Implements BenchmarkRunner for vLLM on Moore Threads MUSA GPUs via the +``vllm-musa`` platform plugin. All orchestration logic lives in +``runners/benchmark_runner.py``. + +The plugin works by patching vLLM at import time: + - ``torchada`` aliases the CUDA Python API onto MUSA + - ``pymtml`` (mthreads-ml-py) provides device queries equivalent to + nvidia-ml-py + - A few Triton attention/worker patches are applied to make the standard + vLLM kernels run on MUSA's Triton compiler. + +As a result, the standard vLLM Python API (``LLM``, ``AsyncLLMEngine``, +``SamplingParams``) is fully preserved. This runner is therefore structurally +identical to the NVIDIA / AMD / Ascend vLLM runners — the differences are +in capability flags, device-count detection, and memory teardown. 
+ +Hardware: Moore Threads MTT S4000 / S5000 (and forward-compatible + successors). S3000 / S80 may also work but are not the public + reference target. +Runtime: MUSA (Meta-computing Unified System Architecture) +Framework: vllm-musa — https://github.com/MooreThreads/vllm-musa + (also published on PyPI as ``vllm-musa``) +Precision: BF16 (preferred on S4000+), FP16 fallback. FP8 not yet + supported on shipping MUSA hardware. +Quantization: compressed-tensors (W8A8 / W8A16) declared by default. AWQ / + GPTQ / FP8 may be added once validated on real hardware. +Multi-chip: Tensor parallelism via MCCL (Moore Threads Collective + Communications Library). vLLM's tensor_parallel_size flag works + unchanged because torchada aliases the NCCL API surface. +Streaming: Fully supported — AsyncLLMEngine API is identical to vLLM. + +Installation (without a real device this is "informational"; final +versions to be confirmed at smoke-test time): + + # 1. Install the MUSA toolkit + driver matching your card firmware: + # https://developer.mthreads.com/musa/ + # 2. Install Moore Threads' PyTorch build (torch + torchada) inside the + # official MUSA container, then: + pip install -r runners/moorethreads_vllm_musa_{hash8}/requirements.txt + +Usage: + + # S5000 single chip + python run.py --runner moorethreads_vllm_musa_{hash8} --suite suite_F + + # Multi-chip tensor parallelism (e.g. 
8 x S5000) + VLLM_WORKER_MULTIPROC_METHOD=spawn \ + python run.py --runner moorethreads_vllm_musa_{hash8} \ + --suite suite_B --tensor-parallel-size 8 + +Environment variables you might want to set: + MUSA_VISIBLE_DEVICES — equivalent to CUDA_VISIBLE_DEVICES + VLLM_WORKER_MULTIPROC_METHOD=spawn — recommended for multi-process workers +""" + +import asyncio +import gc +import sys +import time +from pathlib import Path +from typing import Optional + +# Add repo root to path +_REPO_ROOT = Path(__file__).resolve().parent.parent.parent +sys.path.insert(0, str(_REPO_ROOT)) + +from runners.benchmark_runner import BenchmarkRunner, InferenceRequest +from loadgen.types import InferenceResult + + +import logging +logging.getLogger("vllm.engine.async_llm_engine").setLevel(logging.WARNING) +logging.getLogger("vllm.engine.llm_engine").setLevel(logging.WARNING) + + +class MoorethreadsVLLMMUSARunner(BenchmarkRunner): + """ + AccelMark benchmark runner using ``vllm-musa`` on Moore Threads MUSA GPUs. + + ``vllm-musa`` is registered as a vLLM platform plugin and is auto-detected + on ``import vllm``. The plugin activates the MUSA backend when: + - the plugin package is installed in the environment + - Moore Threads devices are visible to the process + + The inference methods below are byte-for-byte identical in shape to the + NVIDIA vLLM runner — platform-specific logic is isolated to + ``_get_chip_count()``, ``load_model()``, ``get_peak_memory_gb()``, and + ``release_resources()``. + """ + + SUPPORTS_STREAMING = True + SUPPORTS_BATCHING = True + SUPPORTS_ONLINE = True + SUPPORTS_MULTI_CHIP = True # MCCL-based tensor parallelism on multi-card hosts + + # S4000 / S5000 advertise native BF16 for LLM workloads; FP16 always works + # as a fallback. FP32 is left in the list for completeness but is rarely + # used for inference. FP8 is excluded entirely — current shipping MUSA + # hardware does not expose native FP8 datapaths. 
+ SUPPORTED_PRECISIONS = ["bf16", "fp16", "fp32"] + + # Quantization backends — start conservative. compressed-tensors is the + # safe default on every modern vLLM build because the kernels are pure + # Triton + PyTorch matmuls and so are reachable through torchada. + # Marlin / AWQ-CUDA / native FP8 require kernel-level validation on MUSA + # and should be added in a follow-up runner version after real-hardware + # smoke tests, not silently flipped on here. + SUPPORTED_QUANTIZATION_BACKENDS = ["compressed-tensors"] + + def __init__(self): + self.llm = None # vllm.LLM (offline / accuracy) + self.engine = None # vllm.AsyncLLMEngine (online / interactive) + self.tokenizer = None + self.sampling_params = None + self._loop: asyncio.AbstractEventLoop = None + + # ── Metadata ───────────────────────────────────────────────────────────── + + def _get_chip_count(self) -> int: + """Return the number of available Moore Threads MUSA GPUs. + + Preference order: + 1. ``pymtml`` (the Moore Threads management library, equivalent to + nvidia-ml-py). Most reliable because it queries the driver + directly and is not affected by ``MUSA_VISIBLE_DEVICES`` if + called before any ``torch`` initialisation. + 2. ``torch.cuda.device_count()`` — torchada aliases ``torch.cuda`` + to MUSA so this returns the visible MUSA device count in the + current process (respecting ``MUSA_VISIBLE_DEVICES``). + """ + try: + import pymtml + pymtml.mtmlInit() + try: + n = pymtml.mtmlDeviceGetCount() + finally: + try: + pymtml.mtmlShutdown() + except Exception: + pass + if n and n > 0: + return int(n) + except Exception: + pass + + try: + import torch + n = torch.cuda.device_count() + return n if n > 0 else 1 + except Exception: + return 1 + + def _get_framework_name(self) -> str: + # The leaderboard groups by framework string; keep this distinct from + # plain "vLLM" so MUSA results are not silently mixed with CUDA results. 
+ return "vllm-musa" + + def _get_framework_version(self) -> str: + """Report vllm-musa plugin version, with vLLM core version appended. + + The plugin version is the meaningful identifier (it pins the patch + set), but the underlying vLLM core version is what generates kernels + and parses configs. Reporting both makes results reproducible. + """ + plugin_version = "unknown" + try: + from importlib.metadata import version + plugin_version = version("vllm-musa") + except Exception: + try: + import vllm_musa_platform # type: ignore + plugin_version = getattr(vllm_musa_platform, "__version__", "unknown") + except Exception: + pass + + core_version = "unknown" + try: + import vllm + core_version = vllm.__version__ + except Exception: + pass + + if plugin_version == "unknown" and core_version == "unknown": + return "unknown" + if plugin_version == "unknown": + return core_version + return f"{plugin_version}+vllm-{core_version}" + + def get_model_format(self) -> str: + return "HuggingFace original" + + # ── Model loading ──────────────────────────────────────────────────────── + + def load_model(self, model_path: str, parallelism: dict) -> None: + """ + Load model onto Moore Threads MUSA GPU(s) via vllm-musa. + + vllm-musa uses the standard vLLM ``LLM`` / ``AsyncLLMEngine`` + constructors. The MUSA backend activates automatically when the + plugin package is installed and Moore Threads devices are present — + no explicit device flag is required in engine kwargs. + + Pipeline parallelism is not supported (matches the vLLM CUDA backend + behaviour). Use ``tensor_parallel_size`` for multi-chip runs. 
+ """ + from transformers import AutoTokenizer + from vllm import LLM, AsyncLLMEngine, SamplingParams + from vllm.engine.arg_utils import AsyncEngineArgs + + tp_size = parallelism["tensor_parallel_size"] + pp_size = parallelism["pipeline_parallel_size"] + ep_size = parallelism.get("expert_parallel_size", 1) + assert pp_size <= 1, ( + "Pipeline parallelism (pp_size > 1) is not supported in " + "MoorethreadsVLLMMUSARunner. Use --tensor-parallel-size for " + "multi-chip runs." + ) + + max_tokens = parallelism["max_tokens"] + max_model_len = parallelism["max_model_len"] + use_async = parallelism["use_async"] + enforce_eager = getattr(self, "_enforce_eager", False) + + cfg = getattr(self, "_runner_config", {}) + max_num_seqs = cfg.get("max_num_seqs", 256) + # vLLM's flag name is gpu_memory_utilization, but on MUSA it controls + # the per-card HBM fraction reserved for the KV cache. We keep the + # vLLM name to stay schema-compatible with other runners' configs. + musa_memory_util = cfg.get("gpu_memory_utilization", 0.85) + extra_kwargs = dict(cfg.get("engine_kwargs") or {}) + + # Filter engine_kwargs to only fields the installed vLLM version + # accepts. EngineArgs is a strict dataclass — unknown kwargs raise + # TypeError at construction. vllm-musa supports vLLM 0.10.x and 0.13.x, + # whose EngineArgs fields differ slightly; filtering keeps the YAML + # forward-compatible. 
+ try: + import dataclasses + from vllm.engine.arg_utils import EngineArgs as _EngineArgs + _valid = {f.name for f in dataclasses.fields(_EngineArgs)} + _dropped = {k: v for k, v in extra_kwargs.items() if k not in _valid} + if _dropped: + print(f" Warning: engine_kwargs keys not supported by this " + f"vllm-musa / vLLM version and will be ignored: " + f"{list(_dropped)}") + extra_kwargs = {k: v for k, v in extra_kwargs.items() if k in _valid} + except Exception: + pass + + effective_precision = getattr(self, "_effective_precision", "BF16").upper() + precision = getattr(self, "_precision", None) or effective_precision + + _dtype_override = getattr(self, "_precision_dtype_override", None) + _prec_eng_kwargs = dict(getattr(self, "_precision_engine_kwargs", None) or {}) + quantization = _prec_eng_kwargs.pop("quantization", None) + + _NATIVE_DTYPE_MAP = {"BF16": "bfloat16", "FP16": "float16", "FP32": "float32"} + dtype = _NATIVE_DTYPE_MAP.get(precision, "auto") + self._quantization_method = quantization + + if _dtype_override: + dtype = _dtype_override + if _prec_eng_kwargs: + _prec_eng_kwargs.update(extra_kwargs) + extra_kwargs = _prec_eng_kwargs + + # Translate the runner's flat speculative-decoding keys into the + # dict-form ``speculative_config`` used by recent vLLM versions. Skip + # if the user already provided ``speculative_config`` directly. 
+ if "speculative_model" in extra_kwargs and "speculative_config" not in extra_kwargs: + extra_kwargs["speculative_config"] = { + "model": extra_kwargs.pop("speculative_model"), + "num_speculative_tokens": extra_kwargs.pop("num_speculative_tokens", 4), + "draft_tensor_parallel_size": extra_kwargs.pop( + "speculative_draft_tensor_parallel_size", 1 + ), + } + + print( + f"Loading model: precision={precision}, dtype={dtype}" + + (f", quantization_method={self._quantization_method}" + if self._quantization_method else "") + ) + + self.tokenizer = AutoTokenizer.from_pretrained( + model_path, trust_remote_code=False + ) + + self.sampling_params = SamplingParams( + max_tokens=max_tokens, + temperature=0.0, + ) + + base_kwargs = dict( + model=model_path, + dtype=dtype, + tensor_parallel_size=tp_size, + trust_remote_code=False, + enforce_eager=enforce_eager, + ) + if ep_size > 1: + base_kwargs["enable_expert_parallel"] = True + if quantization: + base_kwargs["quantization"] = quantization + if max_model_len: + base_kwargs["max_model_len"] = max_model_len + + if not use_async: + self.llm = LLM(**{ + **base_kwargs, + "max_num_seqs": max_num_seqs, + "gpu_memory_utilization": musa_memory_util, + **extra_kwargs, + }) + else: + self._loop = asyncio.new_event_loop() + asyncio.set_event_loop(self._loop) + engine_args = AsyncEngineArgs(**{ + **base_kwargs, + "gpu_memory_utilization": musa_memory_util, + **extra_kwargs, + }) + self.engine = AsyncLLMEngine.from_engine_args(engine_args) + + def get_effective_dtype(self) -> Optional[str]: + """Report the actual compute dtype vllm-musa resolved after loading.""" + try: + if self.llm is not None: + return str(self.llm.llm_engine.model_config.dtype).replace("torch.", "") + elif self.engine is not None: + return str(self.engine.engine.model_config.dtype).replace("torch.", "") + except Exception: + pass + return getattr(self, "_effective_dtype", None) + + # ── Inference ──────────────────────────────────────────────────────────── + + def 
inference_fn_offline( + self, requests: list[InferenceRequest] + ) -> list[InferenceResult]: + """ + Synchronous batch inference via vllm-musa LLM.generate(). + total_time_ms is wall-clock elapsed time for the full batch. + """ + formatted = [self._format_prompt(r.prompt) for r in requests] + t_start = time.perf_counter() + outputs = self.llm.generate(formatted, self.sampling_params) + elapsed = time.perf_counter() - t_start + + self._last_accuracy_outputs = [o.outputs[0].text for o in outputs] + + results = [] + for output in outputs: + results.append(InferenceResult( + first_token_time_ms=None, + total_time_ms=elapsed * 1000, + output_tokens=len(output.outputs[0].token_ids), + input_tokens=len(output.prompt_token_ids), + success=True, + output_text=output.outputs[0].text, + )) + return results + + async def inference_fn_streaming( + self, request: InferenceRequest + ) -> InferenceResult: + """Async streaming for TTFT — API identical to NVIDIA vLLM runner.""" + from vllm.utils import random_uuid + + formatted = self._format_prompt(request.prompt) + request_id = random_uuid() + t_start = time.perf_counter() + first_token_time_ms = None + output_tokens = 0 + output_text = "" + + async for output in self.engine.generate( + formatted, self.sampling_params, request_id + ): + if ( + first_token_time_ms is None + and len(output.outputs[0].token_ids) > 0 + ): + first_token_time_ms = (time.perf_counter() - t_start) * 1000 + output_tokens = len(output.outputs[0].token_ids) + output_text = output.outputs[0].text + + total_time_ms = (time.perf_counter() - t_start) * 1000 + return InferenceResult( + first_token_time_ms=first_token_time_ms, + total_time_ms=total_time_ms, + output_tokens=output_tokens, + input_tokens=0, + success=True, + output_text=output_text, + ) + + async def inference_fn_token_stream(self, request: InferenceRequest): + """Async generator yielding text deltas for serve-layer SSE.""" + from vllm.utils import random_uuid + + formatted = 
self._format_prompt(request.prompt) + request_id = random_uuid() + prev_length = 0 + + async for output in self.engine.generate( + formatted, self.sampling_params, request_id + ): + current_text = output.outputs[0].text + delta = current_text[prev_length:] + if delta: + yield delta + prev_length = len(current_text) + + # ── Memory & teardown ──────────────────────────────────────────────────── + + def get_peak_memory_gb(self) -> Optional[float]: + """Query peak HBM usage on the active MUSA device. + + torchada aliases ``torch.cuda.max_memory_allocated()`` onto MUSA, so + the standard CUDA API returns peak MUSA memory. We fall back to + ``pymtml`` if torch is unavailable for some reason. + """ + try: + import torch + return torch.cuda.max_memory_allocated() / (1024 ** 3) + except Exception: + pass + # pymtml fallback — returns currently-used memory, not strictly peak, + # but useful when torch.cuda is gone. + try: + import pymtml + pymtml.mtmlInit() + try: + dev = pymtml.mtmlDeviceGetByIndex(0) + info = pymtml.mtmlDeviceGetMemoryInfo(dev) + used = getattr(info, "used", None) + if used is not None: + return float(used) / (1024 ** 3) + finally: + try: + pymtml.mtmlShutdown() + except Exception: + pass + except Exception: + pass + return None + + def release_resources(self) -> None: + """ + Release vllm-musa engines and MUSA memory. + + Teardown order mirrors the NVIDIA runner: + 1. Shut down async engine (if online/interactive was used) + 2. Delete engine objects to trigger Python GC + 3. vLLM distributed-state cleanup (cleanup_dist_env_and_memory) + 4. MCCL / torch.distributed process group destruction + 5. 
MUSA memory cache flush via torch.cuda (aliased to MUSA by torchada) + """ + if self.llm is not None: + try: + del self.llm + except Exception: + pass + self.llm = None + + if self.engine is not None: + try: + if self._loop and not self._loop.is_closed(): + self._loop.run_until_complete(self.engine.shutdown()) + except Exception: + pass + try: + del self.engine + except Exception: + pass + self.engine = None + + # vLLM distributed state cleanup. cleanup_dist_env_and_memory is the + # same entry point as upstream vLLM — vllm-musa patches the internals + # but keeps the public function name. + try: + from vllm.distributed.parallel_state import cleanup_dist_env_and_memory + cleanup_dist_env_and_memory(shutdown_ray=False) + except Exception: + try: + from vllm.distributed.parallel_state import ( + destroy_model_parallel, + destroy_distributed_environment, + ) + destroy_model_parallel() + destroy_distributed_environment() + except Exception: + pass + + # Destroy the active torch.distributed process group. On MUSA the + # backend is MCCL (Moore Threads Collective Communications Library) + # but is exposed through the standard torch.distributed.destroy_process_group + # entry point thanks to torchada. + try: + import torch + if torch.distributed.is_initialized(): + torch.distributed.destroy_process_group() + except Exception: + pass + + gc.collect() + + # Flush MUSA memory cache. torch.cuda.* is aliased to MUSA by torchada, + # so the standard CUDA cache-management APIs work without modification. 
+ try: + import torch + torch.cuda.empty_cache() + torch.cuda.reset_peak_memory_stats() + except Exception: + pass + + # ── Argument parsing ───────────────────────────────────────────────────── + + def parse_args(self): + """Add vllm-musa / Moore Threads-specific CLI flags.""" + args = super().parse_args() + cfg = self._runner_config + + import argparse + parser = argparse.ArgumentParser(add_help=False) + parser.add_argument("--tensor-parallel-size", type=int, default=None, + dest="tensor_parallel_size") + parser.add_argument("--expert-parallel-size", type=int, default=None, + dest="expert_parallel_size") + parser.add_argument("--enforce-eager", action="store_true", default=False, + dest="enforce_eager") + extra, _ = parser.parse_known_args() + + tp_size, _tp_source = self._resolve_tensor_parallel_size( + extra.tensor_parallel_size + ) + ep_size = (extra.expert_parallel_size + if extra.expert_parallel_size is not None + else cfg.get("expert_parallel_size", 1)) + + self._enforce_eager = extra.enforce_eager or cfg.get("enforce_eager", False) + + print(f" tensor_parallel_size = {tp_size} [{_tp_source}]") + if ep_size > 1: + print(f" expert_parallel_size = {ep_size} [cli/yaml]") + + self._parallelism = { + "tensor_parallel_size": tp_size, + "pipeline_parallel_size": 1, + "expert_parallel_size": ep_size, + "data_parallel_size": 1, + } + self._chip_count = tp_size + return args + + def get_extra_subprocess_args(self, args) -> list[str]: + """Forward vllm-musa / Moore Threads-specific flags to subprocesses.""" + extra = [ + "--tensor-parallel-size", + str(self._parallelism.get("tensor_parallel_size", 1)), + ] + if self._parallelism.get("expert_parallel_size", 1) > 1: + extra += ["--expert-parallel-size", + str(self._parallelism["expert_parallel_size"])] + if self._enforce_eager: + extra += ["--enforce-eager"] + return extra + + +if __name__ == "__main__": + MoorethreadsVLLMMUSARunner().main() From 61b977ed0f3dc7cbc9681cce93520dace1d6e699 Mon Sep 17 00:00:00 2001 From: 
Liang Juhao Date: Mon, 18 May 2026 16:51:21 +0800 Subject: [PATCH 2/5] update moore runner --- README.md | 2 +- ...rethreads_vllm_musa_f2f6f965.yaml.example} | 6 +- runners/README.md | 2 +- .../moorethreads_vllm_musa_57ff5443/README.md | 200 ------------ .../moorethreads_vllm_musa_57ff5443/meta.json | 21 -- .../requirements.txt | 58 ---- .../moorethreads_vllm_musa_f2f6f965/README.md | 145 +++++++++ .../moorethreads_vllm_musa_f2f6f965/meta.json | 21 ++ .../requirements.txt | 22 ++ .../runner.py | 303 +++++------------- .../test_smoke.py | 77 +++++ runners/platforms/moorethreads.py | 192 +++++++---- 12 files changed, 487 insertions(+), 562 deletions(-) rename configs/runner_configs/{runner_moorethreads_vllm_musa_57ff5443.yaml.example => runner_moorethreads_vllm_musa_f2f6f965.yaml.example} (91%) delete mode 100644 runners/moorethreads_vllm_musa_57ff5443/README.md delete mode 100644 runners/moorethreads_vllm_musa_57ff5443/meta.json delete mode 100644 runners/moorethreads_vllm_musa_57ff5443/requirements.txt create mode 100644 runners/moorethreads_vllm_musa_f2f6f965/README.md create mode 100644 runners/moorethreads_vllm_musa_f2f6f965/meta.json create mode 100644 runners/moorethreads_vllm_musa_f2f6f965/requirements.txt rename runners/{moorethreads_vllm_musa_57ff5443 => moorethreads_vllm_musa_f2f6f965}/runner.py (52%) create mode 100644 runners/moorethreads_vllm_musa_f2f6f965/test_smoke.py diff --git a/README.md b/README.md index 92cec27..2ca3d64 100644 --- a/README.md +++ b/README.md @@ -93,7 +93,7 @@ Reference runners live under `runners/` (see each folder’s `meta.json`). 
The t | Huawei Ascend NPU | `ascend_vllm_ascend_d4aa9fda` | vllm-ascend | ✓ | ✓ | ✓ | ✓ | ✓ | — | — | | Apple Silicon | `apple_mlx_lm_9546b8b5` | mlx-lm | ⋯ | — | — | ⋯ | — | ⋯ | — | | Google TPU | `google_vllm_tpu_68cc9ffa` | vllm-tpu | ✓ | — | — | ✓ | — | ✓ | — | -| Moore Threads GPU | `moorethreads_vllm_musa_57ff5443` | vllm-musa | ⋯ | ⋯ | ⋯ | ⋯ | ⋯ | ⋯ | — | +| Moore Threads GPU | `moorethreads_vllm_musa_f2f6f965` | vllm-musa | ⋯ | ⋯ | ⋯ | ⋯ | ⋯ | ⋯ | — | _Legend: ✓ validated · ⋯ author-declared (not smoke-tested in this repo yet) · — unsupported._ diff --git a/configs/runner_configs/runner_moorethreads_vllm_musa_57ff5443.yaml.example b/configs/runner_configs/runner_moorethreads_vllm_musa_f2f6f965.yaml.example similarity index 91% rename from configs/runner_configs/runner_moorethreads_vllm_musa_57ff5443.yaml.example rename to configs/runner_configs/runner_moorethreads_vllm_musa_f2f6f965.yaml.example index 5c8f878..c18f98b 100644 --- a/configs/runner_configs/runner_moorethreads_vllm_musa_57ff5443.yaml.example +++ b/configs/runner_configs/runner_moorethreads_vllm_musa_f2f6f965.yaml.example @@ -1,6 +1,6 @@ -# AccelMark runner config — moorethreads_vllm_musa_57ff5443 (vllm-musa on Moore Threads) +# AccelMark runner config — moorethreads_vllm_musa_f2f6f965 (vllm-musa on Moore Threads) # -# Copy this file to runner_moorethreads_vllm_musa_57ff5443.yaml (remove +# Copy this file to runner_moorethreads_vllm_musa_f2f6f965.yaml (remove # .example suffix) and edit as needed for your hardware. The actual .yaml # is gitignored. # @@ -45,8 +45,6 @@ suites: gpu_memory_utilization: 0.80 suite_F: - # Consumer / edge GPU — enforce_eager often safer for first runs. 
- # enforce_eager: true max_num_seqs: 128 # ── Speculative decoding (suite_A / suite_D extra scenario) ───────────────── diff --git a/runners/README.md b/runners/README.md index 95290aa..aaf4d81 100644 --- a/runners/README.md +++ b/runners/README.md @@ -252,7 +252,7 @@ nvidia_trtllm_fp8_8d2f1a4b amd_vllm_rocm_7b2e1d8f ascend_mindie_9c4a3f11 apple_mlx_b3e21f09 -moorethreads_vllm_musa_57ff5443 +moorethreads_vllm_musa_f2f6f965 ``` --- diff --git a/runners/moorethreads_vllm_musa_57ff5443/README.md b/runners/moorethreads_vllm_musa_57ff5443/README.md deleted file mode 100644 index 82411a1..0000000 --- a/runners/moorethreads_vllm_musa_57ff5443/README.md +++ /dev/null @@ -1,200 +0,0 @@ -# moorethreads_vllm_musa_57ff5443 — Moore Threads MUSA Runner (vllm-musa) - -AccelMark runner for Moore Threads MUSA GPUs using -[vllm-musa](https://github.com/MooreThreads/vllm-musa), the official vLLM -platform plugin for MUSA hardware. - -> **Status:** This runner is **untested on real silicon at the time of -> commit**. The code is written against the public `vllm-musa` plugin -> documentation and follows the structural template of the -> `ascend_vllm_ascend_*` runner. Plan to smoke-test on an S5000 / S4000 -> system; capability flags and dtype mappings may be adjusted in a follow-up -> runner version (new hash, new folder) based on real-world findings. - -## How vllm-musa works - -`vllm-musa` is a vLLM **platform plugin** (auto-detected on `import vllm`) -that makes the standard vLLM Python API run on Moore Threads MUSA GPUs. 
It -relies on three components: - -| Component | Role | -|---|---| -| `torchada` | CUDA→MUSA compatibility layer for PyTorch — aliases `torch.cuda.*` to MUSA so most code paths run unmodified | -| `pymtml` (`mthreads-ml-py`) | Moore Threads Management Library bindings, equivalent to `nvidia-ml-py` | -| Triton patches | Runtime monkey-patches in `vllm_musa_platform.patches.*` that fix `triton.attention` and `worker` modules for MUSA's Triton compiler | - -The standard `vllm.LLM`, `vllm.AsyncLLMEngine`, and `vllm.SamplingParams` -remain the entry points — this runner therefore reuses ~95% of the logic -from the NVIDIA / Ascend vLLM runners. - -## Supported suites - -| Suite | Description | Notes | -|-------|-------------|-------| -| Suite A | Single-chip, Llama-3-8B | Pending smoke test on S4000 / S5000 | -| Suite B | Multi-chip, Llama-3-70B | Requires multiple Moore Threads cards + MCCL TP | -| Suite C | Quantization, Llama-3.1-8B | FP8 skipped (no native FP8 in current MUSA hardware); compressed-tensors W8A8/W8A16 candidate; AWQ / GPTQ pending validation | -| Suite D | Long context ~28K input, Llama-3.1-8B | Reduce `max_num_seqs` and `gpu_memory_utilization` | -| Suite E | Multi-chip scaling, Llama-3-8B | Validates MCCL tensor parallelism | -| Suite F | Consumer/edge, Qwen2.5-0.5B | Recommended starting point for S4000 single-card systems | - -## Hardware compatibility - -| GPU | BF16 | TP via MCCL | FP8 | Notes | -|-----|------|-------------|-----|-------| -| MTT S5000 | ✅ | ✅ | ❌ | Recommended public reference target (FA3 via MATE) | -| MTT S4000 | ✅ | ✅ | ❌ | Validated path with PyTorch SDPA-based FlashAttention | -| MTT S3000 | ⚠️ | ⚠️ | ❌ | May work via `--enforce-eager`; not the public reference | -| MTT S80 | ⚠️ | — | ❌ | Consumer card; treat as best-effort | - -## Prerequisites - -You must install the MUSA stack in this exact order — Python packages alone -are not sufficient: - -**1. 
MUSA toolkit + driver** - -Match the toolkit version to your card firmware. Reference: - - -**2. PyTorch with MUSA support (torch + torchada)** - -The recommended path is the official Moore Threads container, which ships a -pre-built `torch==2.7.1` together with `torchada` and `pymtml`. See: - -```bash -docker pull sh-harbor.mthreads.com/mcctest/musa-compile:rc4.3.3-torch2.7-20251120 -``` - -**3. Runner dependencies** - -Inside the MUSA container: - -```bash -pip install -r runners/moorethreads_vllm_musa_57ff5443/requirements.txt -``` - -This installs `vllm-musa==0.1.1` which auto-pulls a validated vLLM core -(`0.10.1.1` by default). To use vLLM `0.13.0` instead (V1-only engine): - -```bash -pip install vllm==0.13.0 --no-deps --upgrade -pip install 'depyf==0.20.0' 'llguidance>=1.3.0,<1.4.0' \ - 'lm-format-enforcer==0.11.3' 'outlines_core==0.2.11' \ - 'xgrammar==0.1.27' 'compressed-tensors==0.12.2' -``` - -## Required environment variables - -```bash -# Device visibility (works like CUDA_VISIBLE_DEVICES) -export MUSA_VISIBLE_DEVICES=0,1,2,3 - -# Recommended for multi-process workers (TP > 1) -export VLLM_WORKER_MULTIPROC_METHOD=spawn -``` - -## Basic usage - -```bash -# Verify the plugin is loaded before running anything else -python -c "from vllm_musa_platform import musa_platform_plugin; print('ok')" - -# Suite F (single-card S4000 / S5000) -python run.py --runner moorethreads_vllm_musa_57ff5443 --suite suite_F - -# Suite A (single-card datacenter benchmark) -python run.py --runner moorethreads_vllm_musa_57ff5443 --suite suite_A - -# Multi-card tensor parallelism (e.g. 
8 x S5000 on a single host) -VLLM_WORKER_MULTIPROC_METHOD=spawn \ -python run.py --runner moorethreads_vllm_musa_57ff5443 \ - --suite suite_B \ - --tensor-parallel-size 8 - -# Local model cache -python run.py --runner moorethreads_vllm_musa_57ff5443 \ - --suite suite_A \ - --model-path /data/models/Meta-Llama-3-8B-Instruct -``` - -## Runner config - -Copy the example config and adjust for your hardware: - -```bash -cp configs/runner_configs/runner_moorethreads_vllm_musa_57ff5443.yaml.example \ - configs/runner_configs/runner_moorethreads_vllm_musa_57ff5443.yaml -``` - -Key settings: - -| Field | Default | Notes | -|-------|---------|-------| -| `tensor_parallel_size` | 1 | Number of MUSA GPUs for tensor parallelism | -| `enforce_eager` | false | Disable CUDA-graph / compilation; useful for pre-S4000 cards or while debugging | -| `max_num_seqs` | 256 | Max concurrent sequences; reduce on lower-memory cards | -| `gpu_memory_utilization` | 0.85 | Fraction of HBM reserved for KV cache; reduce if OOM | - -## Triton / kernel compilation errors - -If you encounter errors during Triton graph capture on first request, -disable graph capture with `--enforce-eager`: - -```bash -python run.py --runner moorethreads_vllm_musa_57ff5443 \ - --suite suite_F --enforce-eager -``` - -Or set persistently in the runner config YAML: - -```yaml -enforce_eager: true -``` - -## HBM OOM errors - -Reduce `gpu_memory_utilization` and/or `max_num_seqs`, either globally or -per-suite (Suite D is the most memory-hungry due to long-context inputs): - -```yaml -gpu_memory_utilization: 0.80 -max_num_seqs: 128 - -suites: - suite_D: - max_num_seqs: 32 - gpu_memory_utilization: 0.78 -``` - -## Known gaps (pre-smoke-test) - -The following items are placeholders and **must be re-validated** on real -S4000 / S5000 hardware: - -- **Memory peak**: relies on `torch.cuda.max_memory_allocated()` which - torchada aliases to MUSA. If this returns 0 or `None`, fall back to - `pymtml.mtmlDeviceGetMemoryInfo()`. 
-- **MCCL teardown**: assumes the same `cleanup_dist_env_and_memory` entry - point as upstream vLLM. If MCCL leaves a hanging process group, the - fallback path explicitly destroys the torch.distributed group. -- **Quantization**: `SUPPORTED_QUANTIZATION_BACKENDS` currently lists only - `compressed-tensors`. AWQ / GPTQ-Marlin / FP8 are intentionally excluded - until kernel coverage on MUSA is confirmed. -- **Precision detection**: `_get_chip_count()` prefers `pymtml` over - `torch.cuda.device_count()`. On hosts where pymtml is missing this may - miscount; in that case the torch fallback should still work because - torchada provides `torch.cuda.device_count()`. - -## Requirements - -See `requirements.txt` for the pinned plugin / extras list. The heavy -dependencies (torch + torchada + MUSA toolkit) must come from the Moore -Threads container; do not install them from PyPI. - -Minimum environment: -- Moore Threads MTT S4000 or newer (S3000 / S80 best-effort) -- MUSA toolkit + driver matching card firmware -- torch 2.7.1 (Moore Threads MUSA build) + torchada ≥ 0.1.9 -- Python 3.10+ -- vllm-musa 0.1.1 (vLLM core 0.10.1.1 or 0.13.0) diff --git a/runners/moorethreads_vllm_musa_57ff5443/meta.json b/runners/moorethreads_vllm_musa_57ff5443/meta.json deleted file mode 100644 index 655a6ef..0000000 --- a/runners/moorethreads_vllm_musa_57ff5443/meta.json +++ /dev/null @@ -1,21 +0,0 @@ -{ - "id": "moorethreads_vllm_musa_57ff5443", - "platform": "moorethreads", - "name": "vllm-musa on Moore Threads MUSA GPU", - "framework": "vllm-musa", - "submitted_by": "JuhaoLiang1997", - "description": "AccelMark runner for Moore Threads MTT S4000 / S5000 MUSA GPUs via the vllm-musa platform plugin (vLLM 0.10.x / 0.13.x + torchada CUDA→MUSA compatibility + pymtml). API-compatible with standard vLLM; MCCL-based tensor parallelism. FP8 excluded — not supported on current MUSA hardware. 
Quantization limited to compressed-tensors (W8A8/W8A16) pending real-hardware validation of AWQ / GPTQ / FP8 paths.", - "supersedes_chain": [], - "notes": "Initial Moore Threads runner. Written from the public vllm-musa documentation and the structural template of ascend_vllm_ascend_d4aa9fda; capability flags, dtype mapping and teardown sequence are placeholders awaiting smoke-testing on real S4000 / S5000 silicon.", - "created": "2026-05-15", - "hardware_label": null, - "suite_support": { - "A": "pending", - "B": "pending", - "C": "pending", - "D": "pending", - "E": "pending", - "F": "pending", - "G": "unsupported" - } -} diff --git a/runners/moorethreads_vllm_musa_57ff5443/requirements.txt b/runners/moorethreads_vllm_musa_57ff5443/requirements.txt deleted file mode 100644 index 2a44733..0000000 --- a/runners/moorethreads_vllm_musa_57ff5443/requirements.txt +++ /dev/null @@ -1,58 +0,0 @@ -# AccelMark -- Moore Threads MUSA vllm-musa runner dependencies -# -# This runner is designed to run inside the official Moore Threads MUSA -# container (which already ships torch + torchada built for the MUSA -# toolkit) and only installs the vLLM platform plugin + accelmark extras -# on top of it. -# -# Tested image (subject to change at smoke-test time): -# sh-harbor.mthreads.com/mcctest/musa-compile:rc4.3.3-torch2.7-20251120 -# Reference docker command: -# docker run -d --net host --privileged --pid=host --shm-size 500g \ -# -v $PWD:/ws -w /ws \ -# --name accelmark-musa \ -# sh-harbor.mthreads.com/mcctest/musa-compile:rc4.3.3-torch2.7-20251120 \ -# sleep infinity -# docker exec -it accelmark-musa bash -# -# Pre-installed in the container (do NOT reinstall via pip): -# torch==2.7.1 (built for MUSA with torchada) -# torchada>=0.1.9 (CUDA→MUSA compatibility layer) -# mthreads-ml-py>=2.2.5 (pymtml — MTML bindings) -# -# vLLM core: the plugin pulls in a compatible version automatically, but for -# reproducibility we pin to one of the validated combinations below. 
-# Pick ONE of these two stacks (uncomment the matching line in the install -# guide in README.md): -# -# stack A — vLLM 0.10.1.1 (V0 + V1 engines): -# pip install -e . # plugin auto-installs vllm==0.10.1.1 -# -# stack B — vLLM 0.13.0 (V1-only): -# pip install -e . # plugin installs vllm==0.10.1.1 -# pip install vllm==0.13.0 --no-deps --upgrade -# pip install 'depyf==0.20.0' 'llguidance>=1.3.0,<1.4.0' \ -# 'lm-format-enforcer==0.11.3' 'outlines_core==0.2.11' \ -# 'xgrammar==0.1.27' 'compressed-tensors==0.12.2' - -# vLLM MUSA platform plugin (PyPI: vllm-musa, GitHub: MooreThreads/vllm-musa) -vllm-musa==0.1.1 - -# Transformers stack — pin to versions compatible with vLLM 0.10.x / 0.13.x -transformers==4.46.3 -tokenizers==0.20.3 -huggingface-hub==0.26.5 -accelerate==1.2.1 -safetensors==0.4.5 - -# AccelMark dependencies (not bundled in the image) -numpy==1.26.4 -jsonschema==4.25.1 -psutil==7.1.0 -tqdm==4.67.1 - -# Async support -aiohttp==3.12.15 - -# Config file parsing -PyYAML==6.0.2 diff --git a/runners/moorethreads_vllm_musa_f2f6f965/README.md b/runners/moorethreads_vllm_musa_f2f6f965/README.md new file mode 100644 index 0000000..e963d18 --- /dev/null +++ b/runners/moorethreads_vllm_musa_f2f6f965/README.md @@ -0,0 +1,145 @@ +# moorethreads_vllm_musa_f2f6f965 — Moore Threads MUSA Runner (vllm-musa) + +AccelMark runner for Moore Threads MUSA GPUs using +[vllm-musa](https://github.com/MooreThreads/vllm-musa). 
+ +## Supported suites + +| Suite | Description | Notes | +|-------|-------------|-------| +| Suite A | Single-chip, Llama-3-8B | Smoke tested on MTT S4000; accuracy not at baseline on vLLM 0.4.x | +| Suite B | Multi-chip, Llama-3-70B | MCCL tensor parallelism; set `VLLM_WORKER_MULTIPROC_METHOD=spawn` | +| Suite C | Quantization, Llama-3.1-8B | FP8 skipped (not supported); W8A8/W8A16 via compressed-tensors | +| Suite D | Long context ~28K input, Llama-3.1-8B | Reduce `max_num_seqs` / `gpu_memory_utilization` in runner config | +| Suite E | Multi-chip scaling, Llama-3-8B | MCCL tensor parallelism | +| Suite F | Edge, Qwen2.5-0.5B | Smoke tested on MTT S4000; recommended first run | +| Suite G | MoE multi-chip, Mixtral-8x7B | Unsupported | + +## Hardware compatibility + +| GPU | BF16 / FP16 | Multi-chip TP | FP8 | Notes | +|-----|-------------|---------------|-----|-------| +| MTT S4000 / S5000 | ✅ (BF16 → float16 on vLLM < 0.10) | ✅ (MCCL) | ❌ | Tested with vLLM 0.4.x+musa | +| MTT S3000 / S80 | ✅ | ✅ | ❌ | May need `--enforce-eager` on Triton errors | + +FP8 is excluded — not supported on this runner. FP32 inference fails with +FlashAttention on MUSA (use FP16 or BF16). Qwen3 requires a newer vLLM + MUSA port +(Qwen2.5 / Llama-3 work on 0.4.x). + +## Prerequisites + +Install in this order — **do not** `pip install torch` or `vllm` from PyPI on a +bare Linux host: + +**1. MUSA toolkit + driver** + + + +**2. vllm-musa (official build)** + +| Resource | URL | +|----------|-----| +| Repository | | +| Build guide | [README_vllm_musa.md](https://github.com/MooreThreads/vllm-musa/blob/main/README_vllm_musa.md) | +| PyTorch MUSA | | + +```bash +git clone https://github.com/MooreThreads/vllm-musa.git +cd vllm-musa +bash build_musa.sh +python -c "from vllm import LLM; print('vllm ok')" +``` + +**3. Runner dependencies** + +```bash +pip install -r runners/moorethreads_vllm_musa_f2f6f965/requirements.txt +``` + +Pin `transformers` to **4.40–4.46** (not 5.x) when on vLLM 0.4.x. 
+ +**Environment variables** + +```bash +export MUSA_VISIBLE_DEVICES=0 +export VLLM_WORKER_MULTIPROC_METHOD=spawn # when tensor_parallel_size > 1 +``` + +## Smoke test + +```bash +python runners/moorethreads_vllm_musa_f2f6f965/test_smoke.py +python runners/moorethreads_vllm_musa_f2f6f965/test_smoke.py /path/to/model +``` + +## Accuracy + +AccelMark runs an integrated MMLU subset after each benchmark using the **same** +vLLM instance as the perf run. The runner sets `device=musa`, dtype, and +tokenizer correctly; low scores on vLLM **0.4.x+musa** reflect broken generation +in that stack, not missing AccelMark wiring. + +| Model | Suite | Measured | Baseline | +|-------|-------|----------|----------| +| Qwen2.5-0.5B-Instruct | F | **~0.07** | 0.37 (FP16) / 0.38 (BF16) | +| Llama-3-8B-Instruct | A | **~0.07** | 0.60 (BF16) | + +Throughput completes normally; answers are effectively random (repetition, system +prompt regurgitation, similar ~7% across different models). + +While accuracy is broken on 0.4.x, use `--skip-accuracy-gate` to finish a perf run: + +```bash +python run.py --runner moorethreads_vllm_musa_f2f6f965 \ + --suite suite_F --precision FP16 --skip-accuracy-gate +``` + +Likely fix: upgrade to vllm-musa aligned with vLLM **0.10+**, keep +`transformers` 4.40–4.46 on legacy forks, then re-run without +`--skip-accuracy-gate`. 
+ +## Usage + +```bash +python run.py --runner moorethreads_vllm_musa_f2f6f965 --suite suite_F --precision FP16 + +VLLM_WORKER_MULTIPROC_METHOD=spawn \ +python run.py --runner moorethreads_vllm_musa_f2f6f965 \ + --suite suite_B --tensor-parallel-size 8 +``` + +Optional runner config (copy and edit): + +```bash +cp configs/runner_configs/runner_moorethreads_vllm_musa_f2f6f965.yaml.example \ + configs/runner_configs/runner_moorethreads_vllm_musa_f2f6f965.yaml +``` + +| Field | Default | Notes | +|-------|---------|-------| +| `tensor_parallel_size` | 1 | MCCL tensor parallelism | +| `enforce_eager` | false | Only if Triton / graph capture errors | +| `max_num_seqs` | 256 | Lower on small HBM | +| `gpu_memory_utilization` | 0.85 | Lower if OOM | + +## Troubleshooting + +| Symptom | Fix | +|---------|-----| +| `GLIBCXX_3.4.30` on import | Import `torch` before `transformers` (runner and smoke test do this) | +| `KeyError: 'type'` in rope_scaling | Pin `transformers==4.46.3` (not 5.x) | +| `Expected musa device, got cuda:0` | Use this runner (`device="musa"`) | +| MMLU ~0.07 | See [Accuracy](#accuracy); `--skip-accuracy-gate` for perf-only runs | +| OOM | Lower `gpu_memory_utilization` / `max_num_seqs` | +| Triton / graph errors | `--enforce-eager` or `enforce_eager: true` in runner YAML | + +## Requirements + +See `requirements.txt` for AccelMark extras. vLLM, torch_musa, and the MUSA +driver are installed per the official vllm-musa guide above (not from this file). 
+ +Minimum environment: + +- Moore Threads GPU with MUSA driver +- Python 3.10+ +- vllm-musa build per [MooreThreads/vllm-musa](https://github.com/MooreThreads/vllm-musa) diff --git a/runners/moorethreads_vllm_musa_f2f6f965/meta.json b/runners/moorethreads_vllm_musa_f2f6f965/meta.json new file mode 100644 index 0000000..9d5728d --- /dev/null +++ b/runners/moorethreads_vllm_musa_f2f6f965/meta.json @@ -0,0 +1,21 @@ +{ + "id": "moorethreads_vllm_musa_f2f6f965", + "platform": "moorethreads", + "name": "vllm-musa on Moore Threads MUSA GPU", + "framework": "vllm-musa", + "submitted_by": "JuhaoLiang1997", + "description": "AccelMark runner for Moore Threads MUSA GPUs using vllm-musa (https://github.com/MooreThreads/vllm-musa). Install torch/vllm per upstream README_vllm_musa.md; requirements.txt adds benchmark deps only. Sets device=musa; BF16 maps to float16 on vLLM <0.10. MCCL tensor parallelism. FP8 unsupported.", + "supersedes_chain": [], + "notes": "MMLU not at baseline on tested vLLM 0.4.x+musa stack — see runner README.", + "created": "2026-05-18", + "hardware_label": null, + "suite_support": { + "A": "pending", + "B": "pending", + "C": "pending", + "D": "pending", + "E": "pending", + "F": "pending", + "G": "unsupported" + } +} diff --git a/runners/moorethreads_vllm_musa_f2f6f965/requirements.txt b/runners/moorethreads_vllm_musa_f2f6f965/requirements.txt new file mode 100644 index 0000000..1fe16ee --- /dev/null +++ b/runners/moorethreads_vllm_musa_f2f6f965/requirements.txt @@ -0,0 +1,22 @@ +# AccelMark — moorethreads_vllm_musa_f2f6f965 +# +# AccelMark benchmark dependencies only. 
Install MUSA toolkit, torch_musa, and +# vllm-musa first — see README.md and https://github.com/MooreThreads/vllm-musa +# +# pip install -r runners/moorethreads_vllm_musa_f2f6f965/requirements.txt +# python runners/moorethreads_vllm_musa_f2f6f965/test_smoke.py + +# AccelMark / loadgen +numpy==1.26.4 +jsonschema==4.25.1 +psutil==7.1.0 +tqdm==4.67.1 +aiohttp==3.12.15 +PyYAML==6.0.2 + +# Tokenizer / config (pin to match vLLM 0.4.x — see README) +transformers>=4.43.0,<4.47.0 +tokenizers>=0.20.0,<0.21.0 +huggingface-hub>=0.26.0,<0.27.0 +accelerate>=1.2.0,<1.3.0 +safetensors>=0.4.5,<0.5.0 diff --git a/runners/moorethreads_vllm_musa_57ff5443/runner.py b/runners/moorethreads_vllm_musa_f2f6f965/runner.py similarity index 52% rename from runners/moorethreads_vllm_musa_57ff5443/runner.py rename to runners/moorethreads_vllm_musa_f2f6f965/runner.py index d753330..b693369 100644 --- a/runners/moorethreads_vllm_musa_57ff5443/runner.py +++ b/runners/moorethreads_vllm_musa_f2f6f965/runner.py @@ -1,59 +1,8 @@ """ -AccelMark — Moore Threads MUSA GPU benchmark runner (vllm-musa). - -Implements BenchmarkRunner for vLLM on Moore Threads MUSA GPUs via the -``vllm-musa`` platform plugin. All orchestration logic lives in -``runners/benchmark_runner.py``. - -The plugin works by patching vLLM at import time: - - ``torchada`` aliases the CUDA Python API onto MUSA - - ``pymtml`` (mthreads-ml-py) provides device queries equivalent to - nvidia-ml-py - - A few Triton attention/worker patches are applied to make the standard - vLLM kernels run on MUSA's Triton compiler. - -As a result, the standard vLLM Python API (``LLM``, ``AsyncLLMEngine``, -``SamplingParams``) is fully preserved. This runner is therefore structurally -identical to the NVIDIA / AMD / Ascend vLLM runners — the differences are -in capability flags, device-count detection, and memory teardown. - -Hardware: Moore Threads MTT S4000 / S5000 (and forward-compatible - successors). 
S3000 / S80 may also work but are not the public - reference target. -Runtime: MUSA (Meta-computing Unified System Architecture) -Framework: vllm-musa — https://github.com/MooreThreads/vllm-musa - (also published on PyPI as ``vllm-musa``) -Precision: BF16 (preferred on S4000+), FP16 fallback. FP8 not yet - supported on shipping MUSA hardware. -Quantization: compressed-tensors (W8A8 / W8A16) declared by default. AWQ / - GPTQ / FP8 may be added once validated on real hardware. -Multi-chip: Tensor parallelism via MCCL (Moore Threads Collective - Communications Library). vLLM's tensor_parallel_size flag works - unchanged because torchada aliases the NCCL API surface. -Streaming: Fully supported — AsyncLLMEngine API is identical to vLLM. - -Installation (without a real device this is "informational"; final -versions to be confirmed at smoke-test time): - - # 1. Install the MUSA toolkit + driver matching your card firmware: - # https://developer.mthreads.com/musa/ - # 2. Install Moore Threads' PyTorch build (torch + torchada) inside the - # official MUSA container, then: - pip install -r runners/moorethreads_vllm_musa_{hash8}/requirements.txt - -Usage: - - # S5000 single chip - python run.py --runner moorethreads_vllm_musa_{hash8} --suite suite_F - - # Multi-chip tensor parallelism (e.g. 8 x S5000) - VLLM_WORKER_MULTIPROC_METHOD=spawn \ - python run.py --runner moorethreads_vllm_musa_{hash8} \ - --suite suite_B --tensor-parallel-size 8 - -Environment variables you might want to set: - MUSA_VISIBLE_DEVICES — equivalent to CUDA_VISIBLE_DEVICES - VLLM_WORKER_MULTIPROC_METHOD=spawn — recommended for multi-process workers +AccelMark — Moore Threads MUSA vLLM benchmark runner (vllm-musa). + +Implements BenchmarkRunner for vllm-musa on Moore Threads MUSA GPUs. +See README.md in this folder for install and hardware notes. 
""" import asyncio @@ -63,74 +12,38 @@ from pathlib import Path from typing import Optional -# Add repo root to path _REPO_ROOT = Path(__file__).resolve().parent.parent.parent sys.path.insert(0, str(_REPO_ROOT)) from runners.benchmark_runner import BenchmarkRunner, InferenceRequest from loadgen.types import InferenceResult - import logging logging.getLogger("vllm.engine.async_llm_engine").setLevel(logging.WARNING) logging.getLogger("vllm.engine.llm_engine").setLevel(logging.WARNING) class MoorethreadsVLLMMUSARunner(BenchmarkRunner): - """ - AccelMark benchmark runner using ``vllm-musa`` on Moore Threads MUSA GPUs. - - ``vllm-musa`` is registered as a vLLM platform plugin and is auto-detected - on ``import vllm``. The plugin activates the MUSA backend when: - - the plugin package is installed in the environment - - Moore Threads devices are visible to the process - - The inference methods below are byte-for-byte identical in shape to the - NVIDIA vLLM runner — platform-specific logic is isolated to - ``_get_chip_count()``, ``load_model()``, ``get_peak_memory_gb()``, and - ``release_resources()``. - """ + """vLLM on Moore Threads MUSA via vllm-musa.""" SUPPORTS_STREAMING = True SUPPORTS_BATCHING = True SUPPORTS_ONLINE = True - SUPPORTS_MULTI_CHIP = True # MCCL-based tensor parallelism on multi-card hosts - - # S4000 / S5000 advertise native BF16 for LLM workloads; FP16 always works - # as a fallback. FP32 is left in the list for completeness but is rarely - # used for inference. FP8 is excluded entirely — current shipping MUSA - # hardware does not expose native FP8 datapaths. - SUPPORTED_PRECISIONS = ["bf16", "fp16", "fp32"] - - # Quantization backends — start conservative. compressed-tensors is the - # safe default on every modern vLLM build because the kernels are pure - # Triton + PyTorch matmuls and so are reachable through torchada. 
- # Marlin / AWQ-CUDA / native FP8 require kernel-level validation on MUSA - # and should be added in a follow-up runner version after real-hardware - # smoke tests, not silently flipped on here. + SUPPORTS_MULTI_CHIP = True + + SUPPORTED_PRECISIONS = ["bf16", "fp16"] SUPPORTED_QUANTIZATION_BACKENDS = ["compressed-tensors"] + _musa_runtime_prepared = False + def __init__(self): - self.llm = None # vllm.LLM (offline / accuracy) - self.engine = None # vllm.AsyncLLMEngine (online / interactive) + self.llm = None + self.engine = None self.tokenizer = None self.sampling_params = None self._loop: asyncio.AbstractEventLoop = None - # ── Metadata ───────────────────────────────────────────────────────────── - def _get_chip_count(self) -> int: - """Return the number of available Moore Threads MUSA GPUs. - - Preference order: - 1. ``pymtml`` (the Moore Threads management library, equivalent to - nvidia-ml-py). Most reliable because it queries the driver - directly and is not affected by ``MUSA_VISIBLE_DEVICES`` if - called before any ``torch`` initialisation. - 2. ``torch.cuda.device_count()`` — torchada aliases ``torch.cuda`` - to MUSA so this returns the visible MUSA device count in the - current process (respecting ``MUSA_VISIBLE_DEVICES``). - """ try: import pymtml pymtml.mtmlInit() @@ -145,7 +58,6 @@ def _get_chip_count(self) -> int: return int(n) except Exception: pass - try: import torch n = torch.cuda.device_count() @@ -154,17 +66,9 @@ def _get_chip_count(self) -> int: return 1 def _get_framework_name(self) -> str: - # The leaderboard groups by framework string; keep this distinct from - # plain "vLLM" so MUSA results are not silently mixed with CUDA results. return "vllm-musa" def _get_framework_version(self) -> str: - """Report vllm-musa plugin version, with vLLM core version appended. - - The plugin version is the meaningful identifier (it pins the patch - set), but the underlying vLLM core version is what generates kernels - and parses configs. 
Reporting both makes results reproducible. - """ plugin_version = "unknown" try: from importlib.metadata import version @@ -175,14 +79,11 @@ def _get_framework_version(self) -> str: plugin_version = getattr(vllm_musa_platform, "__version__", "unknown") except Exception: pass - - core_version = "unknown" try: import vllm core_version = vllm.__version__ except Exception: - pass - + core_version = "unknown" if plugin_version == "unknown" and core_version == "unknown": return "unknown" if plugin_version == "unknown": @@ -192,20 +93,44 @@ def _get_framework_version(self) -> str: def get_model_format(self) -> str: return "HuggingFace original" - # ── Model loading ──────────────────────────────────────────────────────── + @classmethod + def _prepare_musa_runtime(cls) -> None: + if cls._musa_runtime_prepared: + return + import torch # noqa: F401 + cls._musa_runtime_prepared = True - def load_model(self, model_path: str, parallelism: dict) -> None: - """ - Load model onto Moore Threads MUSA GPU(s) via vllm-musa. + @staticmethod + def _legacy_vllm_musa() -> bool: + try: + import vllm + ver = vllm.__version__.split("+")[0] + major, minor = (int(x) for x in ver.split(".")[:2]) + return (major, minor) < (0, 10) + except Exception: + return True - vllm-musa uses the standard vLLM ``LLM`` / ``AsyncLLMEngine`` - constructors. The MUSA backend activates automatically when the - plugin package is installed and Moore Threads devices are present — - no explicit device flag is required in engine kwargs. 
+ @staticmethod + def _get_engine_arg_fields() -> set[str]: + try: + import dataclasses + from vllm.engine.arg_utils import EngineArgs + return {f.name for f in dataclasses.fields(EngineArgs)} + except Exception: + return set() + + def _resolve_musa_dtype(self, dtype: str, precision: str) -> str: + if not self._legacy_vllm_musa(): + return dtype + if dtype in ("bfloat16", "auto") or precision.upper() == "BF16": + if dtype != "float16": + print(" Note: vLLM 0.4.x+musa — using float16") + return "float16" + return dtype + + def load_model(self, model_path: str, parallelism: dict) -> None: + self._prepare_musa_runtime() - Pipeline parallelism is not supported (matches the vLLM CUDA backend - behaviour). Use ``tensor_parallel_size`` for multi-chip runs. - """ from transformers import AutoTokenizer from vllm import LLM, AsyncLLMEngine, SamplingParams from vllm.engine.arg_utils import AsyncEngineArgs @@ -214,9 +139,7 @@ def load_model(self, model_path: str, parallelism: dict) -> None: pp_size = parallelism["pipeline_parallel_size"] ep_size = parallelism.get("expert_parallel_size", 1) assert pp_size <= 1, ( - "Pipeline parallelism (pp_size > 1) is not supported in " - "MoorethreadsVLLMMUSARunner. Use --tensor-parallel-size for " - "multi-chip runs." + "Pipeline parallelism is not supported. Use --tensor-parallel-size." ) max_tokens = parallelism["max_tokens"] @@ -226,33 +149,22 @@ def load_model(self, model_path: str, parallelism: dict) -> None: cfg = getattr(self, "_runner_config", {}) max_num_seqs = cfg.get("max_num_seqs", 256) - # vLLM's flag name is gpu_memory_utilization, but on MUSA it controls - # the per-card HBM fraction reserved for the KV cache. We keep the - # vLLM name to stay schema-compatible with other runners' configs. musa_memory_util = cfg.get("gpu_memory_utilization", 0.85) extra_kwargs = dict(cfg.get("engine_kwargs") or {}) - # Filter engine_kwargs to only fields the installed vLLM version - # accepts. 
EngineArgs is a strict dataclass — unknown kwargs raise - # TypeError at construction. vllm-musa supports vLLM 0.10.x and 0.13.x, - # whose EngineArgs fields differ slightly; filtering keeps the YAML - # forward-compatible. - try: - import dataclasses - from vllm.engine.arg_utils import EngineArgs as _EngineArgs - _valid = {f.name for f in dataclasses.fields(_EngineArgs)} - _dropped = {k: v for k, v in extra_kwargs.items() if k not in _valid} + _valid_engine_fields = self._get_engine_arg_fields() + if _valid_engine_fields: + _dropped = {k: v for k, v in extra_kwargs.items() + if k not in _valid_engine_fields} if _dropped: print(f" Warning: engine_kwargs keys not supported by this " f"vllm-musa / vLLM version and will be ignored: " f"{list(_dropped)}") - extra_kwargs = {k: v for k, v in extra_kwargs.items() if k in _valid} - except Exception: - pass + extra_kwargs = {k: v for k, v in extra_kwargs.items() + if k in _valid_engine_fields} effective_precision = getattr(self, "_effective_precision", "BF16").upper() precision = getattr(self, "_precision", None) or effective_precision - _dtype_override = getattr(self, "_precision_dtype_override", None) _prec_eng_kwargs = dict(getattr(self, "_precision_engine_kwargs", None) or {}) quantization = _prec_eng_kwargs.pop("quantization", None) @@ -263,13 +175,11 @@ def load_model(self, model_path: str, parallelism: dict) -> None: if _dtype_override: dtype = _dtype_override + dtype = self._resolve_musa_dtype(dtype, precision) if _prec_eng_kwargs: _prec_eng_kwargs.update(extra_kwargs) extra_kwargs = _prec_eng_kwargs - # Translate the runner's flat speculative-decoding keys into the - # dict-form ``speculative_config`` used by recent vLLM versions. Skip - # if the user already provided ``speculative_config`` directly. 
if "speculative_model" in extra_kwargs and "speculative_config" not in extra_kwargs: extra_kwargs["speculative_config"] = { "model": extra_kwargs.pop("speculative_model"), @@ -288,11 +198,7 @@ def load_model(self, model_path: str, parallelism: dict) -> None: self.tokenizer = AutoTokenizer.from_pretrained( model_path, trust_remote_code=False ) - - self.sampling_params = SamplingParams( - max_tokens=max_tokens, - temperature=0.0, - ) + self.sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0) base_kwargs = dict( model=model_path, @@ -301,6 +207,8 @@ def load_model(self, model_path: str, parallelism: dict) -> None: trust_remote_code=False, enforce_eager=enforce_eager, ) + if not _valid_engine_fields or "device" in _valid_engine_fields: + base_kwargs["device"] = "musa" if ep_size > 1: base_kwargs["enable_expert_parallel"] = True if quantization: @@ -326,25 +234,16 @@ def load_model(self, model_path: str, parallelism: dict) -> None: self.engine = AsyncLLMEngine.from_engine_args(engine_args) def get_effective_dtype(self) -> Optional[str]: - """Report the actual compute dtype vllm-musa resolved after loading.""" try: if self.llm is not None: return str(self.llm.llm_engine.model_config.dtype).replace("torch.", "") - elif self.engine is not None: + if self.engine is not None: return str(self.engine.engine.model_config.dtype).replace("torch.", "") except Exception: pass return getattr(self, "_effective_dtype", None) - # ── Inference ──────────────────────────────────────────────────────────── - - def inference_fn_offline( - self, requests: list[InferenceRequest] - ) -> list[InferenceResult]: - """ - Synchronous batch inference via vllm-musa LLM.generate(). - total_time_ms is wall-clock elapsed time for the full batch. 
- """ + def inference_fn_offline(self, requests: list[InferenceRequest]) -> list[InferenceResult]: formatted = [self._format_prompt(r.prompt) for r in requests] t_start = time.perf_counter() outputs = self.llm.generate(formatted, self.sampling_params) @@ -352,22 +251,19 @@ def inference_fn_offline( self._last_accuracy_outputs = [o.outputs[0].text for o in outputs] - results = [] - for output in outputs: - results.append(InferenceResult( + return [ + InferenceResult( first_token_time_ms=None, total_time_ms=elapsed * 1000, - output_tokens=len(output.outputs[0].token_ids), - input_tokens=len(output.prompt_token_ids), + output_tokens=len(o.outputs[0].token_ids), + input_tokens=len(o.prompt_token_ids), success=True, - output_text=output.outputs[0].text, - )) - return results - - async def inference_fn_streaming( - self, request: InferenceRequest - ) -> InferenceResult: - """Async streaming for TTFT — API identical to NVIDIA vLLM runner.""" + output_text=o.outputs[0].text, + ) + for o in outputs + ] + + async def inference_fn_streaming(self, request: InferenceRequest) -> InferenceResult: from vllm.utils import random_uuid formatted = self._format_prompt(request.prompt) @@ -380,18 +276,14 @@ async def inference_fn_streaming( async for output in self.engine.generate( formatted, self.sampling_params, request_id ): - if ( - first_token_time_ms is None - and len(output.outputs[0].token_ids) > 0 - ): + if first_token_time_ms is None and len(output.outputs[0].token_ids) > 0: first_token_time_ms = (time.perf_counter() - t_start) * 1000 output_tokens = len(output.outputs[0].token_ids) output_text = output.outputs[0].text - total_time_ms = (time.perf_counter() - t_start) * 1000 return InferenceResult( first_token_time_ms=first_token_time_ms, - total_time_ms=total_time_ms, + total_time_ms=(time.perf_counter() - t_start) * 1000, output_tokens=output_tokens, input_tokens=0, success=True, @@ -399,7 +291,6 @@ async def inference_fn_streaming( ) async def inference_fn_token_stream(self, 
request: InferenceRequest): - """Async generator yielding text deltas for serve-layer SSE.""" from vllm.utils import random_uuid formatted = self._format_prompt(request.prompt) @@ -415,22 +306,12 @@ async def inference_fn_token_stream(self, request: InferenceRequest): yield delta prev_length = len(current_text) - # ── Memory & teardown ──────────────────────────────────────────────────── - def get_peak_memory_gb(self) -> Optional[float]: - """Query peak HBM usage on the active MUSA device. - - torchada aliases ``torch.cuda.max_memory_allocated()`` onto MUSA, so - the standard CUDA API returns peak MUSA memory. We fall back to - ``pymtml`` if torch is unavailable for some reason. - """ try: import torch return torch.cuda.max_memory_allocated() / (1024 ** 3) except Exception: pass - # pymtml fallback — returns currently-used memory, not strictly peak, - # but useful when torch.cuda is gone. try: import pymtml pymtml.mtmlInit() @@ -450,16 +331,6 @@ def get_peak_memory_gb(self) -> Optional[float]: return None def release_resources(self) -> None: - """ - Release vllm-musa engines and MUSA memory. - - Teardown order mirrors the NVIDIA runner: - 1. Shut down async engine (if online/interactive was used) - 2. Delete engine objects to trigger Python GC - 3. vLLM distributed-state cleanup (cleanup_dist_env_and_memory) - 4. MCCL / torch.distributed process group destruction - 5. MUSA memory cache flush via torch.cuda (aliased to MUSA by torchada) - """ if self.llm is not None: try: del self.llm @@ -479,9 +350,6 @@ def release_resources(self) -> None: pass self.engine = None - # vLLM distributed state cleanup. cleanup_dist_env_and_memory is the - # same entry point as upstream vLLM — vllm-musa patches the internals - # but keeps the public function name. 
try: from vllm.distributed.parallel_state import cleanup_dist_env_and_memory cleanup_dist_env_and_memory(shutdown_ray=False) @@ -496,10 +364,6 @@ def release_resources(self) -> None: except Exception: pass - # Destroy the active torch.distributed process group. On MUSA the - # backend is MCCL (Moore Threads Collective Communications Library) - # but is exposed through the standard torch.distributed.destroy_process_group - # entry point thanks to torchada. try: import torch if torch.distributed.is_initialized(): @@ -509,8 +373,6 @@ def release_resources(self) -> None: gc.collect() - # Flush MUSA memory cache. torch.cuda.* is aliased to MUSA by torchada, - # so the standard CUDA cache-management APIs work without modification. try: import torch torch.cuda.empty_cache() @@ -518,10 +380,8 @@ def release_resources(self) -> None: except Exception: pass - # ── Argument parsing ───────────────────────────────────────────────────── - def parse_args(self): - """Add vllm-musa / Moore Threads-specific CLI flags.""" + """Add vllm-musa-specific CLI flags. Base class pre-loads runner config.""" args = super().parse_args() cfg = self._runner_config @@ -541,24 +401,29 @@ def parse_args(self): ep_size = (extra.expert_parallel_size if extra.expert_parallel_size is not None else cfg.get("expert_parallel_size", 1)) - self._enforce_eager = extra.enforce_eager or cfg.get("enforce_eager", False) print(f" tensor_parallel_size = {tp_size} [{_tp_source}]") if ep_size > 1: print(f" expert_parallel_size = {ep_size} [cli/yaml]") + if not self.SUPPORTS_MULTI_CHIP and tp_size > 1: + print(f"Warning: {self.__class__.__name__} does not support multi-chip. 
" + f"Ignoring tensor_parallel_size={tp_size}, using 1.") + tp_size = 1 + ep_size = 1 + self._parallelism = { - "tensor_parallel_size": tp_size, + "tensor_parallel_size": tp_size, "pipeline_parallel_size": 1, - "expert_parallel_size": ep_size, - "data_parallel_size": 1, + "expert_parallel_size": ep_size, + "data_parallel_size": 1, } self._chip_count = tp_size + self._precision = getattr(args, "precision", None) return args def get_extra_subprocess_args(self, args) -> list[str]: - """Forward vllm-musa / Moore Threads-specific flags to subprocesses.""" extra = [ "--tensor-parallel-size", str(self._parallelism.get("tensor_parallel_size", 1)), diff --git a/runners/moorethreads_vllm_musa_f2f6f965/test_smoke.py b/runners/moorethreads_vllm_musa_f2f6f965/test_smoke.py new file mode 100644 index 0000000..86cbbf9 --- /dev/null +++ b/runners/moorethreads_vllm_musa_f2f6f965/test_smoke.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python3 +""" +Standalone vllm-musa smoke test (does not use the AccelMark runner). 
+ +Usage (from repo root): + + python runners/moorethreads_vllm_musa_f2f6f965/test_smoke.py + python runners/moorethreads_vllm_musa_f2f6f965/test_smoke.py /path/to/model + + MODEL_PATH=/path/to/Qwen2.5-0.5B-Instruct \\ + python runners/moorethreads_vllm_musa_f2f6f965/test_smoke.py +""" + +from __future__ import annotations + +import gc +import os +import sys +import time + +import torch # noqa: F401 — before transformers/vllm (libstdc++ load order) + +from vllm import LLM, SamplingParams + +_DEFAULT_MODEL = os.getenv("MODEL_PATH", "Qwen/Qwen2.5-0.5B-Instruct") + +PROMPTS = [ + "The capital of France is", + "Say hello in one short sentence.", +] + + +def main() -> int: + model_path = sys.argv[1] if len(sys.argv) > 1 else _DEFAULT_MODEL + + sampling_params = SamplingParams(temperature=0.0, max_tokens=64) + + print(f"Loading {model_path} ...") + t_load = time.perf_counter() + llm = LLM( + model=model_path, + device="musa", + dtype="float16", + tensor_parallel_size=1, + max_model_len=1024, + max_num_seqs=4, + gpu_memory_utilization=0.85, + trust_remote_code=False, + ) + print(f"Model loaded in {time.perf_counter() - t_load:.1f}s\n") + + t_infer = time.perf_counter() + outputs = llm.generate(PROMPTS, sampling_params) + print(f"Inference done in {time.perf_counter() - t_infer:.1f}s\n") + + for prompt, output in zip(PROMPTS, outputs): + text = output.outputs[0].text + n_tokens = len(output.outputs[0].token_ids) + print(f"Prompt: {prompt!r}") + print(f"Output: {text!r}") + print(f"Tokens: {n_tokens}\n") + + del llm + gc.collect() + try: + if hasattr(torch, "musa"): + torch.musa.empty_cache() + else: + torch.cuda.empty_cache() + except Exception: + pass + print("Done.") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/runners/platforms/moorethreads.py b/runners/platforms/moorethreads.py index 708db1b..9f55684 100644 --- a/runners/platforms/moorethreads.py +++ b/runners/platforms/moorethreads.py @@ -1,17 +1,13 @@ """Moore Threads MUSA GPU 
platform plug-in. -Moore Threads ships its own driver and management tooling: - -* ``mthreads-gmi`` — the moral equivalent of ``nvidia-smi`` / ``rocm-smi``. -* ``pymtml`` — Python bindings analogous to NVML / pynvml. -* ``torchada`` — a CUDA→MUSA compatibility shim that exposes the standard - ``torch.cuda`` API, with the real backend version available via - ``torch.version.musa``. - -This plug-in first tries the Python bindings (best machine-readable -output) and falls back to scraping ``mthreads-gmi`` text output. Both -paths are best-effort: when none of the tools are installed the plug-in -silently reports zero accelerators and the collector moves on. +Used by ``runners/collect_env.py`` to populate ``env_info.json``. + +Detection order (first non-empty wins): + + 1. ``pymtml`` (mthreads-ml-py) — same API as used in the vllm-musa runner + 2. ``mthreads-gmi`` text output + 3. ``torch`` device properties (``torch.cuda`` aliased to MUSA via torchada, + or native ``torch.musa`` when available) """ from __future__ import annotations @@ -23,8 +19,6 @@ VENDOR_LABEL = "Moore Threads" PRIORITY = 60 -# S5000 / S4000 datacenter SKUs ship with native BF16 support; the older -# consumer-class MTT S80/S70 cards are FP16-only. 
_BF16_SUPPORTED_HINTS = ("s5000", "s4000", "s3000") _NO_BF16_HINTS = ("s80", "s70", "s60", "s50") @@ -40,50 +34,68 @@ def _supports_bf16(chip_name: str) -> bool: return True +def _driver_version_from_smi() -> str | None: + try: + out = subprocess.check_output( + ["mthreads-gmi"], text=True, stderr=subprocess.DEVNULL + ) + m = re.search(r"Driver\s+Version\s*:\s*(\S+)", out, re.IGNORECASE) + if m: + return m.group(1) + except Exception: + pass + return None + + def _collect_via_pymtml() -> list[dict]: try: - import pymtml as mtml # type: ignore[import-not-found] + import pymtml except ImportError: return [] try: - mtml.mtmlInit() + pymtml.mtmlInit() except Exception: return [] + driver = _driver_version_from_smi() or "unknown" accelerators: list[dict] = [] try: - count = mtml.mtmlDeviceGetCount() + count = pymtml.mtmlDeviceGetCount() except Exception: try: - mtml.mtmlShutdown() + pymtml.mtmlShutdown() except Exception: pass return [] for idx in range(int(count)): try: - handle = mtml.mtmlDeviceGetHandleByIndex(idx) - name = mtml.mtmlDeviceGetName(handle) - mem = mtml.mtmlDeviceGetMemoryInfo(handle) - total_mb = getattr(mem, "total", None) or mem.get("total", 0) - driver = mtml.mtmlSystemGetDriverVersion() + dev = pymtml.mtmlDeviceGetByIndex(idx) + name = pymtml.mtmlDeviceGetName(dev) + mem = pymtml.mtmlDeviceGetMemoryInfo(dev) + total_bytes = getattr(mem, "total", None) + if total_bytes is None and isinstance(mem, dict): + total_bytes = mem.get("total") except Exception: continue + if not isinstance(name, str): + name = name.decode("utf-8", "ignore") + memory_gb = round(int(total_bytes) / (1024 ** 3), 1) if total_bytes else None accelerators.append( { "index": idx, - "name": name if isinstance(name, str) else name.decode("utf-8", "ignore"), + "name": name, "vendor": VENDOR_LABEL, - "memory_gb": round(int(total_mb) / 1024, 1) if total_mb else None, - "driver_version": driver if isinstance(driver, str) else driver.decode("utf-8", "ignore"), + "memory_gb": memory_gb, + 
"driver_version": driver, "firmware_version": None, - "supports_bf16": _supports_bf16(str(name)), + "supports_bf16": _supports_bf16(name), } ) try: - mtml.mtmlShutdown() + pymtml.mtmlShutdown() except Exception: pass @@ -91,12 +103,7 @@ def _collect_via_pymtml() -> list[dict]: def _collect_via_smi() -> list[dict]: - """Fallback parser for ``mthreads-gmi`` text output. - - The output format mirrors nvidia-smi: a header with the driver / MUSA - versions followed by per-device blocks listing the product name and - memory usage. We only need the device name and total memory. - """ + """Parse ``mthreads-gmi`` text output (mthreads-gmi 1.14+ tabular format).""" try: out = subprocess.check_output( ["mthreads-gmi"], text=True, stderr=subprocess.DEVNULL @@ -110,21 +117,18 @@ def _collect_via_smi() -> list[dict]: driver = m.group(1) accelerators: list[dict] = [] - # Per-device rows look like: - # | 0 MTT S4000 ... | 0000:65:00.0 Off | ... | - # followed by: - # | 0% 45C P0 ... / ... | 234MiB / 49152MiB | ... 
| + # Example row: + # 0 MTT S4000 |00000000:28:00.0 |0% 4MiB(49152MiB) for match in re.finditer( - r"\|\s*(\d+)\s+(MTT\s+\S+(?:\s+\S+)?)\s*", out + r"^(\d+)\s+(MTT\s+\S+)\s+\|", + out, + re.MULTILINE, ): idx = int(match.group(1)) name = match.group(2).strip() - # Search downstream of this match for the memory line - tail = out[match.end():] - mem_match = re.search(r"(\d+)MiB\s*/\s*(\d+)MiB", tail) - memory_gb = None - if mem_match: - memory_gb = round(int(mem_match.group(2)) / 1024, 1) + tail = out[match.end(): match.end() + 256] + mem_match = re.search(r"\d+MiB\((\d+)MiB\)", tail) + memory_gb = round(int(mem_match.group(1)) / 1024, 1) if mem_match else None accelerators.append( { "index": idx, @@ -139,23 +143,69 @@ def _collect_via_smi() -> list[dict]: return accelerators +def _collect_via_torch() -> list[dict]: + """Fallback when management libraries are missing but torch MUSA is loaded.""" + try: + import torch + except ImportError: + return [] + + driver = _driver_version_from_smi() or "unknown" + accelerators: list[dict] = [] + + if hasattr(torch, "musa"): + try: + count = torch.musa.device_count() + get_props = torch.musa.get_device_properties + except Exception: + count = 0 + get_props = None + else: + try: + count = torch.cuda.device_count() + get_props = torch.cuda.get_device_properties + except Exception: + return [] + + for idx in range(int(count)): + try: + props = get_props(idx) + name = getattr(props, "name", None) or f"MTT GPU {idx}" + total = getattr(props, "total_memory", None) + memory_gb = round(total / (1024 ** 3), 1) if total else None + except Exception: + continue + accelerators.append( + { + "index": idx, + "name": name if isinstance(name, str) else str(name), + "vendor": VENDOR_LABEL, + "memory_gb": memory_gb, + "driver_version": driver, + "firmware_version": None, + "supports_bf16": _supports_bf16(str(name)), + } + ) + return accelerators + + def collect() -> list[dict]: - accelerators = _collect_via_pymtml() - if accelerators: - return 
accelerators - return _collect_via_smi() + for fn in (_collect_via_pymtml, _collect_via_smi, _collect_via_torch): + accelerators = fn() + if accelerators: + return accelerators + return [] def detect_runtime_version() -> str | None: - """Prefer torch.version.musa (most reliable when torchada is installed), - fall back to scraping ``mthreads-gmi`` header. - """ try: import torch ver = getattr(torch.version, "musa", None) if ver: return f"MUSA {ver}" + if getattr(torch.version, "cuda", None): + return f"MUSA (torch.cuda shim) {torch.version.cuda}" except ImportError: pass @@ -174,17 +224,43 @@ def detect_runtime_version() -> str | None: return None +def detect_pcie_gen() -> str | None: + try: + out = subprocess.check_output( + ["mthreads-gmi"], text=True, stderr=subprocess.DEVNULL + ) + m = re.search(r"\|\s*(\d+)x\((\d+)x\)\s*\|", out) + if m: + return f"PCIe {m.group(1)}x/{m.group(2)}x" + except Exception: + pass + return None + + +def detect_intra_node_interconnect() -> str | None: + """Moore Threads multi-GPU hosts typically use MCCL over PCIe.""" + accels = collect() + if len(accels) > 1: + return "MCCL/PCIe" + return None + + def diagnostics(env: dict, accelerators: list[dict]) -> list[str]: notes: list[str] = [] - if accelerators and (env.get("pytorch_version") or "") == "unknown": + if not accelerators: + notes.append( + "No Moore Threads MUSA GPUs detected (tried pymtml, mthreads-gmi, " + "and torch). Install the MUSA driver/toolkit per " + "https://github.com/MooreThreads/vllm-musa ." + ) + return notes + if (env.get("pytorch_version") or "") == "unknown": notes.append( - "PyTorch (with the torchada MUSA shim) is not installed — " - "pytorch_version is unknown." + "PyTorch with MUSA support is not installed — pytorch_version is unknown." ) - if accelerators and (env.get("runtime_version") or "") == "unknown": + if (env.get("runtime_version") or "") == "unknown": notes.append( "Could not detect MUSA runtime (tried torch.version.musa and " - "mthreads-gmi). 
runtime_version is unknown — install torchada " - "or the Moore Threads MUSA toolkit." + "mthreads-gmi). runtime_version is unknown." ) return notes From 03e30bddd5899b596eadec2283da6f56f632ce8b Mon Sep 17 00:00:00 2001 From: Liang Juhao Date: Mon, 18 May 2026 18:05:54 +0800 Subject: [PATCH 3/5] add moore schema --- schema/env.schema.json | 2 +- schema/result.schema.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/schema/env.schema.json b/schema/env.schema.json index 60fc5e8..e80cd94 100644 --- a/schema/env.schema.json +++ b/schema/env.schema.json @@ -16,7 +16,7 @@ "properties": { "index": { "type": "integer" }, "name": { "type": "string" }, - "vendor": { "type": "string", "description": "Chip vendor, e.g. 'NVIDIA', 'AMD', 'Huawei', 'Apple'" }, + "vendor": { "type": "string", "description": "Chip vendor, e.g. 'NVIDIA', 'AMD', 'Huawei', 'Apple', 'Moore Threads'" }, "memory_gb": { "type": ["number","null"], "minimum": 0 }, "driver_version": { "type": "string" }, "firmware_version": { "type": ["string","null"] }, diff --git a/schema/result.schema.json b/schema/result.schema.json index 99a0517..fb81a8a 100644 --- a/schema/result.schema.json +++ b/schema/result.schema.json @@ -36,7 +36,7 @@ "vendor": { "type": "string", "enum": ["NVIDIA","AMD","Intel","Google","Huawei","Cambricon","Biren", - "Enflame","MetaX","Iluvatar","Apple","Qualcomm","Other"] + "Enflame","MetaX","Moore Threads","Iluvatar","Apple","Qualcomm","Other"] }, "count": { "type": "integer", "minimum": 1 }, "memory_gb": { "type": "number", "minimum": 0 }, From 7e8c3df569fab441331e0aae0788f8b0d460d2e6 Mon Sep 17 00:00:00 2001 From: Liang Juhao Date: Mon, 18 May 2026 18:09:21 +0800 Subject: [PATCH 4/5] upload moore results --- .../accuracy/accuracy.json | 8 + .../env_info.json | 48 ++++ .../offline/result.json | 164 +++++++++++++ .../online/result.json | 163 +++++++++++++ .../result.json | 215 ++++++++++++++++++ .../accuracy/accuracy.json | 8 + .../env_info.json | 48 ++++ 
.../interactive/result.json | 131 +++++++++++ .../offline/result.json | 164 +++++++++++++ .../online/result.json | 151 ++++++++++++ .../result.json | 215 ++++++++++++++++++ 11 files changed, 1315 insertions(+) create mode 100644 results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/accuracy/accuracy.json create mode 100644 results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/env_info.json create mode 100644 results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/offline/result.json create mode 100644 results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/online/result.json create mode 100644 results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/result.json create mode 100644 results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/accuracy/accuracy.json create mode 100644 results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/env_info.json create mode 100644 results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/interactive/result.json create mode 100644 results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/offline/result.json create mode 100644 results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/online/result.json create mode 100644 results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/result.json diff --git a/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/accuracy/accuracy.json b/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/accuracy/accuracy.json new file mode 100644 index 0000000..7242234 --- /dev/null +++ b/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/accuracy/accuracy.json @@ -0,0 +1,8 @@ +{ + "subset_score": 0.07, + "baseline_delta": -0.53, + "valid": false, + "framework": "vllm-musa", + 
"precision": "BF16", + "notes": "Integrated accuracy check \u2014 used same vllm-musa instance as benchmark." +} \ No newline at end of file diff --git a/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/env_info.json b/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/env_info.json new file mode 100644 index 0000000..4244ef7 --- /dev/null +++ b/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/env_info.json @@ -0,0 +1,48 @@ +{ + "collected_at": "2026-05-18T09:21:31.092840+00:00", + "accelerators": [ + { + "index": 0, + "name": "MTT S4000", + "vendor": "Moore Threads", + "memory_gb": 48.0, + "driver_version": "2.7.0", + "firmware_version": null, + "supports_bf16": true + } + ], + "accelerator_platform": "moorethreads", + "accelerator_topology": null, + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Gold 6430", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.5, + "pcie_generation": "PCIe 16x/16x", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_bond_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu Jammy Jellyfish (development branch)", + "python_version": "3.10.8", + "kernel_version": "5.15.0-105-generic", + "runtime_version": "Moore Threads Driver 2.7.0", + "pytorch_version": "2.2.0" +} \ No newline at end of file diff --git a/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/offline/result.json b/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/offline/result.json new file mode 100644 index 0000000..a050fe4 --- /dev/null +++ 
b/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/offline/result.json @@ -0,0 +1,164 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_A", + "implementation_id": "moorethreads_vllm_musa_f2f6f965", + "chip": { + "name": "MTT S4000", + "vendor": "Moore Threads", + "count": 1, + "memory_gb": 48.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-18T09:21:31.092840+00:00", + "accelerators": [ + { + "index": 0, + "name": "MTT S4000", + "vendor": "Moore Threads", + "memory_gb": 48.0, + "driver_version": "2.7.0", + "firmware_version": null, + "supports_bf16": true + } + ], + "accelerator_platform": "moorethreads", + "accelerator_topology": null, + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Gold 6430", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.5, + "pcie_generation": "PCIe 16x/16x", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_bond_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu Jammy Jellyfish (development branch)", + "python_version": "3.10.8", + "kernel_version": "5.15.0-105-generic", + "runtime_version": "Moore Threads Driver 2.7.0", + "pytorch_version": "2.2.0" + }, + "software": { + "framework": "vllm-musa", + "framework_version": "0.4.2", + "driver_version": "2.7.0", + "runtime_version": "Moore Threads Driver 2.7.0", + "os": "Ubuntu Jammy Jellyfish (development branch)", + "python_version": "3.10.8" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + 
"parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "float16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "offline", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 8, + "throughput_tokens_per_sec": 332.62, + "throughput_tokens_per_sec_per_chip": 332.62, + "throughput_tokens_per_sec_total": 922.83, + "elapsed_seconds_median": 43.4, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 32, + "throughput_tokens_per_sec": 331.64, + "throughput_tokens_per_sec_per_chip": 331.64, + "throughput_tokens_per_sec_total": 920.1, + "elapsed_seconds_median": 43.6, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 128, + "throughput_tokens_per_sec": 331.76, + "throughput_tokens_per_sec_per_chip": 331.76, + "throughput_tokens_per_sec_total": 920.46, + "elapsed_seconds_median": 43.6, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. 
The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "17:34:52", + "run_id": "cabb7bd0", + "run_name": "mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0", + "flagged": null, + "reproduce_script": "runners/moorethreads_vllm_musa_f2f6f965/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-18T09:26:10.676960+00:00", + "benchmark_end_time": "2026-05-18T09:34:52.667112+00:00", + "benchmark_elapsed_minutes": 8.7, + "model_load_seconds": 116.8 + } +} \ No newline at end of file diff --git a/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/online/result.json b/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/online/result.json new file mode 100644 index 0000000..064d6b8 --- /dev/null +++ b/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/online/result.json @@ -0,0 +1,163 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_A", + "implementation_id": "moorethreads_vllm_musa_f2f6f965", + "chip": { + "name": "MTT S4000", + "vendor": "Moore Threads", + "count": 1, + "memory_gb": 48.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-18T09:21:31.092840+00:00", + "accelerators": [ + { + "index": 0, + "name": "MTT S4000", + "vendor": "Moore Threads", + "memory_gb": 48.0, + "driver_version": "2.7.0", + "firmware_version": null, + "supports_bf16": true + } + ], + "accelerator_platform": "moorethreads", + "accelerator_topology": null, + "intra_node_interconnect": null, + 
"cpu": { + "model": "Intel(R) Xeon(R) Gold 6430", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.5, + "pcie_generation": "PCIe 16x/16x", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_bond_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu Jammy Jellyfish (development branch)", + "python_version": "3.10.8", + "kernel_version": "5.15.0-105-generic", + "runtime_version": "Moore Threads Driver 2.7.0", + "pytorch_version": "2.2.0" + }, + "software": { + "framework": "vllm-musa", + "framework_version": "0.4.2", + "driver_version": "2.7.0", + "runtime_version": "Moore Threads Driver 2.7.0", + "os": "Ubuntu Jammy Jellyfish (development branch)", + "python_version": "3.10.8" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "float16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "online", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 5, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 194.45, + "ttft_ms_p90": 315.05, + "ttft_ms_p99": 424.55, + "tpot_ms_p50": 201.93, + "tpot_ms_p90": 253.8, + "tpot_ms_p99": 471.28, + "elapsed_seconds_median": 137.6, + "sla_met": true + }, + { + "target_qps": 25, + 
"achieved_qps": 25.0, + "ttft_ms_p50": 4796.14, + "ttft_ms_p90": 8459.18, + "ttft_ms_p99": 9348.86, + "tpot_ms_p50": 355.01, + "tpot_ms_p90": 6430.04, + "tpot_ms_p99": 15579.83, + "elapsed_seconds_median": 93.0, + "sla_met": false + }, + { + "target_qps": 100, + "achieved_qps": 100.0, + "ttft_ms_p50": 10354.27, + "ttft_ms_p90": 17651.16, + "ttft_ms_p99": 19078.89, + "tpot_ms_p50": 849.82, + "tpot_ms_p90": 8677.79, + "tpot_ms_p99": 14281.03, + "elapsed_seconds_median": 90.0, + "sla_met": false + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "17:53:38", + "run_id": "cabb7bd0", + "run_name": "mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0", + "flagged": null, + "reproduce_script": "runners/moorethreads_vllm_musa_f2f6f965/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-18T09:37:13.745117+00:00", + "benchmark_end_time": "2026-05-18T09:53:38.865501+00:00", + "benchmark_elapsed_minutes": 16.4, + "model_load_seconds": 122.7 + } +} \ No newline at end of file diff --git a/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/result.json b/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/result.json new file mode 100644 index 0000000..e4b1093 --- /dev/null +++ b/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/result.json @@ -0,0 +1,215 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_A", + "implementation_id": "moorethreads_vllm_musa_f2f6f965", + "chip": { + "name": "MTT S4000", + "vendor": "Moore Threads", + "count": 1, + "memory_gb": 48.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + 
"environment": { + "collected_at": "2026-05-18T09:21:31.092840+00:00", + "accelerators": [ + { + "index": 0, + "name": "MTT S4000", + "vendor": "Moore Threads", + "memory_gb": 48.0, + "driver_version": "2.7.0", + "firmware_version": null, + "supports_bf16": true + } + ], + "accelerator_platform": "moorethreads", + "accelerator_topology": null, + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Gold 6430", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.5, + "pcie_generation": "PCIe 16x/16x", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_bond_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu Jammy Jellyfish (development branch)", + "python_version": "3.10.8", + "kernel_version": "5.15.0-105-generic", + "runtime_version": "Moore Threads Driver 2.7.0", + "pytorch_version": "2.2.0" + }, + "software": { + "framework": "vllm-musa", + "framework_version": "0.4.2", + "driver_version": "2.7.0", + "runtime_version": "Moore Threads Driver 2.7.0", + "os": "Ubuntu Jammy Jellyfish (development branch)", + "python_version": "3.10.8" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "float16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenarios_run": [ + "offline", + "online" + ], + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 3, + "extra_config": null + }, + "metrics": 
{ + "derived": {}, + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 8, + "throughput_tokens_per_sec": 332.62, + "throughput_tokens_per_sec_per_chip": 332.62, + "throughput_tokens_per_sec_total": 922.83, + "elapsed_seconds_median": 43.4, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 32, + "throughput_tokens_per_sec": 331.64, + "throughput_tokens_per_sec_per_chip": 331.64, + "throughput_tokens_per_sec_total": 920.1, + "elapsed_seconds_median": 43.6, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 128, + "throughput_tokens_per_sec": 331.76, + "throughput_tokens_per_sec_per_chip": 331.76, + "throughput_tokens_per_sec_total": 920.46, + "elapsed_seconds_median": 43.6, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." 
+ } + ] + }, + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 5, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 194.45, + "ttft_ms_p90": 315.05, + "ttft_ms_p99": 424.55, + "tpot_ms_p50": 201.93, + "tpot_ms_p90": 253.8, + "tpot_ms_p99": 471.28, + "elapsed_seconds_median": 137.6, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 4796.14, + "ttft_ms_p90": 8459.18, + "ttft_ms_p99": 9348.86, + "tpot_ms_p50": 355.01, + "tpot_ms_p90": 6430.04, + "tpot_ms_p99": 15579.83, + "elapsed_seconds_median": 93.0, + "sla_met": false + }, + { + "target_qps": 100, + "achieved_qps": 100.0, + "ttft_ms_p50": 10354.27, + "ttft_ms_p90": 17651.16, + "ttft_ms_p99": 19078.89, + "tpot_ms_p50": 849.82, + "tpot_ms_p90": 8677.79, + "tpot_ms_p99": 14281.03, + "elapsed_seconds_median": 90.0, + "sla_met": false + } + ] + } + }, + "accuracy": { + "subset_score": 0.07, + "baseline_delta": -0.53, + "valid": false, + "framework": "vllm-musa", + "precision": "BF16", + "notes": "Integrated accuracy check \u2014 used same vllm-musa instance as benchmark." 
+ }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "17:34:52", + "run_id": "cabb7bd0", + "run_name": "mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0", + "flagged": null, + "reproduce_script": "runners/moorethreads_vllm_musa_f2f6f965/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": "Partial run: ['offline', 'online'] succeeded, ['accuracy'] failed.", + "benchmark_start_time": "2026-05-18T09:26:10.676960+00:00", + "benchmark_end_time": "2026-05-18T09:34:52.667112+00:00", + "benchmark_elapsed_minutes": 25.1, + "model_load_seconds": 116.8, + "benchmark_elapsed_minutes_note": "Total across ['offline', 'online'] scenarios.", + "scenario_dirs": { + "offline": "results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/offline", + "online": "results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/online" + } + } +} \ No newline at end of file diff --git a/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/accuracy/accuracy.json b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/accuracy/accuracy.json new file mode 100644 index 0000000..63c6e92 --- /dev/null +++ b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/accuracy/accuracy.json @@ -0,0 +1,8 @@ +{ + "subset_score": 0.07, + "baseline_delta": -0.31, + "valid": false, + "framework": "vllm-musa", + "precision": "BF16", + "notes": "Integrated accuracy check \u2014 used same vllm-musa instance as benchmark." 
+} \ No newline at end of file diff --git a/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/env_info.json b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/env_info.json new file mode 100644 index 0000000..31f501b --- /dev/null +++ b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/env_info.json @@ -0,0 +1,48 @@ +{ + "collected_at": "2026-05-18T08:40:55.208034+00:00", + "accelerators": [ + { + "index": 0, + "name": "MTT S4000", + "vendor": "Moore Threads", + "memory_gb": 48.0, + "driver_version": "2.7.0", + "firmware_version": null, + "supports_bf16": true + } + ], + "accelerator_platform": "moorethreads", + "accelerator_topology": null, + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Gold 6430", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.5, + "pcie_generation": "PCIe 16x/16x", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_bond_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu Jammy Jellyfish (development branch)", + "python_version": "3.10.8", + "kernel_version": "5.15.0-105-generic", + "runtime_version": "Moore Threads Driver 2.7.0", + "pytorch_version": "2.2.0" +} \ No newline at end of file diff --git a/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/interactive/result.json b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/interactive/result.json new file mode 100644 index 0000000..4f5ff81 --- /dev/null +++ b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/interactive/result.json @@ -0,0 +1,131 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_F", + 
"implementation_id": "moorethreads_vllm_musa_f2f6f965", + "chip": { + "name": "MTT S4000", + "vendor": "Moore Threads", + "count": 1, + "memory_gb": 48.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-18T08:40:55.208034+00:00", + "accelerators": [ + { + "index": 0, + "name": "MTT S4000", + "vendor": "Moore Threads", + "memory_gb": 48.0, + "driver_version": "2.7.0", + "firmware_version": null, + "supports_bf16": true + } + ], + "accelerator_platform": "moorethreads", + "accelerator_topology": null, + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Gold 6430", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.5, + "pcie_generation": "PCIe 16x/16x", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_bond_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu Jammy Jellyfish (development branch)", + "python_version": "3.10.8", + "kernel_version": "5.15.0-105-generic", + "runtime_version": "Moore Threads Driver 2.7.0", + "pytorch_version": "2.2.0" + }, + "software": { + "framework": "vllm-musa", + "framework_version": "0.4.2", + "driver_version": "2.7.0", + "runtime_version": "Moore Threads Driver 2.7.0", + "os": "Ubuntu Jammy Jellyfish (development branch)", + "python_version": "3.10.8" + }, + "model": { + "model_id": "Qwen/Qwen2.5-0.5B-Instruct", + "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 0.5, + "precision": "BF16", + "effective_dtype": "float16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": 
"interactive", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "interactive": { + "ttft_ms_p50": 25.89, + "ttft_ms_p90": 27.18, + "ttft_ms_p99": 28.51, + "tpot_ms_p50": 14.85, + "tpot_ms_p90": 15.17, + "tpot_ms_p99": 15.5, + "peak_memory_gb": null, + "elapsed_seconds_median": 481.4 + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "17:21:09", + "run_id": "4f66d29d", + "run_name": "mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d", + "flagged": null, + "reproduce_script": "runners/moorethreads_vllm_musa_f2f6f965/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-18T08:56:46.686185+00:00", + "benchmark_end_time": "2026-05-18T09:21:09.800661+00:00", + "benchmark_elapsed_minutes": 24.4, + "model_load_seconds": 151.2 + } +} \ No newline at end of file diff --git a/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/offline/result.json b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/offline/result.json new file mode 100644 index 0000000..2498167 --- /dev/null +++ b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/offline/result.json @@ -0,0 +1,164 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_F", + "implementation_id": "moorethreads_vllm_musa_f2f6f965", + "chip": { + "name": "MTT S4000", + "vendor": "Moore Threads", + "count": 1, + "memory_gb": 48.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + 
"environment": { + "collected_at": "2026-05-18T08:40:55.208034+00:00", + "accelerators": [ + { + "index": 0, + "name": "MTT S4000", + "vendor": "Moore Threads", + "memory_gb": 48.0, + "driver_version": "2.7.0", + "firmware_version": null, + "supports_bf16": true + } + ], + "accelerator_platform": "moorethreads", + "accelerator_topology": null, + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Gold 6430", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.5, + "pcie_generation": "PCIe 16x/16x", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_bond_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu Jammy Jellyfish (development branch)", + "python_version": "3.10.8", + "kernel_version": "5.15.0-105-generic", + "runtime_version": "Moore Threads Driver 2.7.0", + "pytorch_version": "2.2.0" + }, + "software": { + "framework": "vllm-musa", + "framework_version": "0.4.2", + "driver_version": "2.7.0", + "runtime_version": "Moore Threads Driver 2.7.0", + "os": "Ubuntu Jammy Jellyfish (development branch)", + "python_version": "3.10.8" + }, + "model": { + "model_id": "Qwen/Qwen2.5-0.5B-Instruct", + "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 0.5, + "precision": "BF16", + "effective_dtype": "float16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "offline", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, 
+ "metrics": { + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 1994.51, + "throughput_tokens_per_sec_per_chip": 1994.51, + "throughput_tokens_per_sec_total": 3642.41, + "elapsed_seconds_median": 12.5, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 1998.44, + "throughput_tokens_per_sec_per_chip": 1998.44, + "throughput_tokens_per_sec_total": 3649.59, + "elapsed_seconds_median": 12.5, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 2004.02, + "throughput_tokens_per_sec_per_chip": 2004.02, + "throughput_tokens_per_sec_total": 3659.77, + "elapsed_seconds_median": 12.5, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." 
+ }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "16:48:27", + "run_id": "4f66d29d", + "run_name": "mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d", + "flagged": null, + "reproduce_script": "runners/moorethreads_vllm_musa_f2f6f965/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-18T08:45:57.373367+00:00", + "benchmark_end_time": "2026-05-18T08:48:27.423209+00:00", + "benchmark_elapsed_minutes": 2.5, + "model_load_seconds": 146.8 + } +} \ No newline at end of file diff --git a/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/online/result.json b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/online/result.json new file mode 100644 index 0000000..eb13372 --- /dev/null +++ b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/online/result.json @@ -0,0 +1,151 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_F", + "implementation_id": "moorethreads_vllm_musa_f2f6f965", + "chip": { + "name": "MTT S4000", + "vendor": "Moore Threads", + "count": 1, + "memory_gb": 48.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-18T08:40:55.208034+00:00", + "accelerators": [ + { + "index": 0, + "name": "MTT S4000", + "vendor": "Moore Threads", + "memory_gb": 48.0, + "driver_version": "2.7.0", + "firmware_version": null, + "supports_bf16": true + } + ], + "accelerator_platform": "moorethreads", + "accelerator_topology": null, + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Gold 6430", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.5, + "pcie_generation": "PCIe 16x/16x", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": 
"mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_bond_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu Jammy Jellyfish (development branch)", + "python_version": "3.10.8", + "kernel_version": "5.15.0-105-generic", + "runtime_version": "Moore Threads Driver 2.7.0", + "pytorch_version": "2.2.0" + }, + "software": { + "framework": "vllm-musa", + "framework_version": "0.4.2", + "driver_version": "2.7.0", + "runtime_version": "Moore Threads Driver 2.7.0", + "os": "Ubuntu Jammy Jellyfish (development branch)", + "python_version": "3.10.8" + }, + "model": { + "model_id": "Qwen/Qwen2.5-0.5B-Instruct", + "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 0.5, + "precision": "BF16", + "effective_dtype": "float16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "online", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 40, + "results_by_qps": [ + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 47.68, + "ttft_ms_p90": 96.31, + "ttft_ms_p99": 956.22, + "tpot_ms_p50": 47.25, + "tpot_ms_p90": 80.82, + "tpot_ms_p99": 131.63, + "elapsed_seconds_median": 37.8, + "sla_met": false + }, + { + "target_qps": 40, + "achieved_qps": 40.0, + "ttft_ms_p50": 94.5, + "ttft_ms_p90": 194.64, + "ttft_ms_p99": 331.88, + "tpot_ms_p50": 74.76, + "tpot_ms_p90": 287.01, + "tpot_ms_p99": 444.19, + "elapsed_seconds_median": 19.0, + "sla_met": true + } + ] + } + }, + "accuracy": { + "subset_score": null, 
+ "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "16:53:54", + "run_id": "4f66d29d", + "run_name": "mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d", + "flagged": null, + "reproduce_script": "runners/moorethreads_vllm_musa_f2f6f965/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-18T08:51:01.188901+00:00", + "benchmark_end_time": "2026-05-18T08:53:54.250762+00:00", + "benchmark_elapsed_minutes": 2.9, + "model_load_seconds": 132.6 + } +} \ No newline at end of file diff --git a/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/result.json b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/result.json new file mode 100644 index 0000000..a1c073d --- /dev/null +++ b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/result.json @@ -0,0 +1,215 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_F", + "implementation_id": "moorethreads_vllm_musa_f2f6f965", + "chip": { + "name": "MTT S4000", + "vendor": "Moore Threads", + "count": 1, + "memory_gb": 48.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-18T08:40:55.208034+00:00", + "accelerators": [ + { + "index": 0, + "name": "MTT S4000", + "vendor": "Moore Threads", + "memory_gb": 48.0, + "driver_version": "2.7.0", + "firmware_version": null, + "supports_bf16": true + } + ], + "accelerator_platform": "moorethreads", + "accelerator_topology": null, + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Gold 6430", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.5, + "pcie_generation": "PCIe 
16x/16x", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_bond_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu Jammy Jellyfish (development branch)", + "python_version": "3.10.8", + "kernel_version": "5.15.0-105-generic", + "runtime_version": "Moore Threads Driver 2.7.0", + "pytorch_version": "2.2.0" + }, + "software": { + "framework": "vllm-musa", + "framework_version": "0.4.2", + "driver_version": "2.7.0", + "runtime_version": "Moore Threads Driver 2.7.0", + "os": "Ubuntu Jammy Jellyfish (development branch)", + "python_version": "3.10.8" + }, + "model": { + "model_id": "Qwen/Qwen2.5-0.5B-Instruct", + "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 0.5, + "precision": "BF16", + "effective_dtype": "float16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenarios_run": [ + "offline", + "online", + "interactive" + ], + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 3, + "extra_config": null + }, + "metrics": { + "derived": {}, + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 1994.51, + "throughput_tokens_per_sec_per_chip": 1994.51, + "throughput_tokens_per_sec_total": 3642.41, + "elapsed_seconds_median": 12.5, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. 
The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 1998.44, + "throughput_tokens_per_sec_per_chip": 1998.44, + "throughput_tokens_per_sec_total": 3649.59, + "elapsed_seconds_median": 12.5, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 2004.02, + "throughput_tokens_per_sec_per_chip": 2004.02, + "throughput_tokens_per_sec_total": 3659.77, + "elapsed_seconds_median": 12.5, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." 
+ } + ] + }, + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 40, + "results_by_qps": [ + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 47.68, + "ttft_ms_p90": 96.31, + "ttft_ms_p99": 956.22, + "tpot_ms_p50": 47.25, + "tpot_ms_p90": 80.82, + "tpot_ms_p99": 131.63, + "elapsed_seconds_median": 37.8, + "sla_met": false + }, + { + "target_qps": 40, + "achieved_qps": 40.0, + "ttft_ms_p50": 94.5, + "ttft_ms_p90": 194.64, + "ttft_ms_p99": 331.88, + "tpot_ms_p50": 74.76, + "tpot_ms_p90": 287.01, + "tpot_ms_p99": 444.19, + "elapsed_seconds_median": 19.0, + "sla_met": true + } + ] + }, + "interactive": { + "ttft_ms_p50": 25.89, + "ttft_ms_p90": 27.18, + "ttft_ms_p99": 28.51, + "tpot_ms_p50": 14.85, + "tpot_ms_p90": 15.17, + "tpot_ms_p99": 15.5, + "peak_memory_gb": null, + "elapsed_seconds_median": 481.4 + } + }, + "accuracy": { + "subset_score": 0.07, + "baseline_delta": -0.31, + "valid": false, + "framework": "vllm-musa", + "precision": "BF16", + "notes": "Integrated accuracy check \u2014 used same vllm-musa instance as benchmark." 
+ }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "16:48:27", + "run_id": "4f66d29d", + "run_name": "mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d", + "flagged": null, + "reproduce_script": "runners/moorethreads_vllm_musa_f2f6f965/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": "Partial run: ['offline', 'online', 'interactive'] succeeded, ['accuracy'] failed.", + "benchmark_start_time": "2026-05-18T08:45:57.373367+00:00", + "benchmark_end_time": "2026-05-18T08:48:27.423209+00:00", + "benchmark_elapsed_minutes": 29.8, + "model_load_seconds": 146.8, + "benchmark_elapsed_minutes_note": "Total across ['offline', 'online', 'interactive'] scenarios.", + "scenario_dirs": { + "offline": "results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/offline", + "online": "results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/online", + "interactive": "results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/interactive" + } + } +} \ No newline at end of file From d2e78a253e8686e5beb62c9228ea2ec8058f1745 Mon Sep 17 00:00:00 2001 From: Liang Juhao Date: Mon, 18 May 2026 18:19:24 +0800 Subject: [PATCH 5/5] update --- README.md | 2 +- runners/moorethreads_vllm_musa_f2f6f965/README.md | 4 ++-- runners/moorethreads_vllm_musa_f2f6f965/meta.json | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 2ca3d64..3007966 100644 --- a/README.md +++ b/README.md @@ -93,7 +93,7 @@ Reference runners live under `runners/` (see each folder’s `meta.json`). 
The t | Huawei Ascend NPU | `ascend_vllm_ascend_d4aa9fda` | vllm-ascend | ✓ | ✓ | ✓ | ✓ | ✓ | — | — | | Apple Silicon | `apple_mlx_lm_9546b8b5` | mlx-lm | ⋯ | — | — | ⋯ | — | ⋯ | — | | Google TPU | `google_vllm_tpu_68cc9ffa` | vllm-tpu | ✓ | — | — | ✓ | — | ✓ | — | -| Moore Threads GPU | `moorethreads_vllm_musa_f2f6f965` | vllm-musa | ⋯ | ⋯ | ⋯ | ⋯ | ⋯ | ⋯ | — | +| Moore Threads GPU | `moorethreads_vllm_musa_f2f6f965` | vllm-musa | ✓ | ⋯ | ⋯ | ⋯ | ⋯ | ✓ | — | _Legend: ✓ validated · ⋯ author-declared (not smoke-tested in this repo yet) · — unsupported._ diff --git a/runners/moorethreads_vllm_musa_f2f6f965/README.md b/runners/moorethreads_vllm_musa_f2f6f965/README.md index e963d18..5111bdc 100644 --- a/runners/moorethreads_vllm_musa_f2f6f965/README.md +++ b/runners/moorethreads_vllm_musa_f2f6f965/README.md @@ -7,12 +7,12 @@ AccelMark runner for Moore Threads MUSA GPUs using | Suite | Description | Notes | |-------|-------------|-------| -| Suite A | Single-chip, Llama-3-8B | Smoke tested on MTT S4000; accuracy not at baseline on vLLM 0.4.x | +| Suite A | Single-chip, Llama-3-8B | Validated on S4000 (default: accuracy/offline/online) | | Suite B | Multi-chip, Llama-3-70B | MCCL tensor parallelism; set `VLLM_WORKER_MULTIPROC_METHOD=spawn` | | Suite C | Quantization, Llama-3.1-8B | FP8 skipped (not supported); W8A8/W8A16 via compressed-tensors | | Suite D | Long context ~28K input, Llama-3.1-8B | Reduce `max_num_seqs` / `gpu_memory_utilization` in runner config | | Suite E | Multi-chip scaling, Llama-3-8B | MCCL tensor parallelism | -| Suite F | Edge, Qwen2.5-0.5B | Smoke tested on MTT S4000; recommended first run | +| Suite F | Edge, Qwen2.5-0.5B | Validated on MTT S4000 (community result in repo) | | Suite G | MoE multi-chip, Mixtral-8x7B | Unsupported | ## Hardware compatibility diff --git a/runners/moorethreads_vllm_musa_f2f6f965/meta.json b/runners/moorethreads_vllm_musa_f2f6f965/meta.json index 9d5728d..e57d72d 100644 --- 
a/runners/moorethreads_vllm_musa_f2f6f965/meta.json +++ b/runners/moorethreads_vllm_musa_f2f6f965/meta.json @@ -6,16 +6,16 @@ "submitted_by": "JuhaoLiang1997", "description": "AccelMark runner for Moore Threads MUSA GPUs using vllm-musa (https://github.com/MooreThreads/vllm-musa). Install torch/vllm per upstream README_vllm_musa.md; requirements.txt adds benchmark deps only. Sets device=musa; BF16 maps to float16 on vLLM <0.10. MCCL tensor parallelism. FP8 unsupported.", "supersedes_chain": [], - "notes": "MMLU not at baseline on tested vLLM 0.4.x+musa stack — see runner README.", + "notes": "Smoke-tested on MTT S4000 (vLLM 0.4.2+musa): Suite A and F default scenarios run. MMLU not at baseline — see runner README.", "created": "2026-05-18", "hardware_label": null, "suite_support": { - "A": "pending", + "A": "validated", "B": "pending", "C": "pending", "D": "pending", "E": "pending", - "F": "pending", + "F": "validated", "G": "unsupported" } }