diff --git a/README.md b/README.md index ea9e2b6..3007966 100644 --- a/README.md +++ b/README.md @@ -93,6 +93,7 @@ Reference runners live under `runners/` (see each folder’s `meta.json`). The t | Huawei Ascend NPU | `ascend_vllm_ascend_d4aa9fda` | vllm-ascend | ✓ | ✓ | ✓ | ✓ | ✓ | — | — | | Apple Silicon | `apple_mlx_lm_9546b8b5` | mlx-lm | ⋯ | — | — | ⋯ | — | ⋯ | — | | Google TPU | `google_vllm_tpu_68cc9ffa` | vllm-tpu | ✓ | — | — | ✓ | — | ✓ | — | +| Moore Threads GPU | `moorethreads_vllm_musa_f2f6f965` | vllm-musa | ✓ | ⋯ | ⋯ | ⋯ | ⋯ | ✓ | — | _Legend: ✓ validated · ⋯ author-declared (not smoke-tested in this repo yet) · — unsupported._ diff --git a/configs/runner_configs/runner_moorethreads_vllm_musa_f2f6f965.yaml.example b/configs/runner_configs/runner_moorethreads_vllm_musa_f2f6f965.yaml.example new file mode 100644 index 0000000..c18f98b --- /dev/null +++ b/configs/runner_configs/runner_moorethreads_vllm_musa_f2f6f965.yaml.example @@ -0,0 +1,60 @@ +# AccelMark runner config — moorethreads_vllm_musa_f2f6f965 (vllm-musa on Moore Threads) +# +# Copy this file to runner_moorethreads_vllm_musa_f2f6f965.yaml (remove +# .example suffix) and edit as needed for your hardware. The actual .yaml +# is gitignored. +# +# These settings adapt the runner to your hardware environment. They are +# recorded in result.json task.extra_config for transparency but are NOT +# part of the benchmark identity (not hashed into run_id). +# +# Merge priority: CLI flags > suite-specific > global defaults > runner defaults + +# ── Global defaults (apply to all suites) ───────────────────────────────────── + +# Tensor parallel size — number of Moore Threads GPUs to use (default: 1). +# For multi-card runs make sure to export VLLM_WORKER_MULTIPROC_METHOD=spawn. +tensor_parallel_size: 1 + +# Disable Triton CUDA-graph / compilation. Set true if you hit Triton kernel +# errors on first request (most common on S3000 / S80 paths). 
+enforce_eager: false + +# Maximum number of sequences in a batch (default: 256). +# Reduce on lower-memory cards: 128 on 24 GB cards, 64 on 16 GB cards. +max_num_seqs: 256 + +# Fraction of MUSA device memory vLLM may use for model weights, activations, +# and the KV cache (default: 0.85). Reduce if you hit OOM; the vLLM flag is +# named gpu_memory_utilization but applies to MUSA device memory via torchada. +gpu_memory_utilization: 0.85 + +# Pass-through kwargs forwarded directly to vLLM LLM() / AsyncEngineArgs(). +# Unknown keys are dropped automatically with a warning, so this is safe to +# use across vLLM 0.10.x / 0.13.x. +# engine_kwargs: +# swap_space: 8 +# max_seq_len_to_capture: 4096 + +# ── Suite-specific overrides ─────────────────────────────────────────────────── + +suites: + suite_D: + # Long-context — reduce batch size and leave more memory headroom. + max_num_seqs: 32 + gpu_memory_utilization: 0.80 + + suite_F: + max_num_seqs: 128 + +# ── Speculative decoding (suite_A / suite_D extra scenario) ───────────────── +# To enable, uncomment and merge suite_A into the suites: block above; a second +# top-level suites: key would be a duplicate. vllm-musa accepts the same +# speculative_config dict as upstream vLLM; the runner translates flat keys +# (speculative_model, num_speculative_tokens, ...) into speculative_config automatically. +# suites: +# suite_A: +# engine_kwargs: +# speculative_model: "meta-llama/Llama-3.2-1B-Instruct" +# num_speculative_tokens: 4 +# speculative_draft_tensor_parallel_size: 1 diff --git a/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/accuracy/accuracy.json b/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/accuracy/accuracy.json new file mode 100644 index 0000000..7242234 --- /dev/null +++ b/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/accuracy/accuracy.json @@ -0,0 +1,8 @@ +{ + "subset_score": 0.07, + "baseline_delta": -0.53, + "valid": false, + "framework": "vllm-musa", + "precision": "BF16", + "notes": "Integrated accuracy check \u2014 used same vllm-musa instance as benchmark."
+} \ No newline at end of file diff --git a/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/env_info.json b/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/env_info.json new file mode 100644 index 0000000..4244ef7 --- /dev/null +++ b/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/env_info.json @@ -0,0 +1,48 @@ +{ + "collected_at": "2026-05-18T09:21:31.092840+00:00", + "accelerators": [ + { + "index": 0, + "name": "MTT S4000", + "vendor": "Moore Threads", + "memory_gb": 48.0, + "driver_version": "2.7.0", + "firmware_version": null, + "supports_bf16": true + } + ], + "accelerator_platform": "moorethreads", + "accelerator_topology": null, + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Gold 6430", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.5, + "pcie_generation": "PCIe 16x/16x", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_bond_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu Jammy Jellyfish (development branch)", + "python_version": "3.10.8", + "kernel_version": "5.15.0-105-generic", + "runtime_version": "Moore Threads Driver 2.7.0", + "pytorch_version": "2.2.0" +} \ No newline at end of file diff --git a/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/offline/result.json b/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/offline/result.json new file mode 100644 index 0000000..a050fe4 --- /dev/null +++ b/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/offline/result.json @@ -0,0 +1,164 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_A", + "implementation_id": 
"moorethreads_vllm_musa_f2f6f965", + "chip": { + "name": "MTT S4000", + "vendor": "Moore Threads", + "count": 1, + "memory_gb": 48.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-18T09:21:31.092840+00:00", + "accelerators": [ + { + "index": 0, + "name": "MTT S4000", + "vendor": "Moore Threads", + "memory_gb": 48.0, + "driver_version": "2.7.0", + "firmware_version": null, + "supports_bf16": true + } + ], + "accelerator_platform": "moorethreads", + "accelerator_topology": null, + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Gold 6430", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.5, + "pcie_generation": "PCIe 16x/16x", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_bond_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu Jammy Jellyfish (development branch)", + "python_version": "3.10.8", + "kernel_version": "5.15.0-105-generic", + "runtime_version": "Moore Threads Driver 2.7.0", + "pytorch_version": "2.2.0" + }, + "software": { + "framework": "vllm-musa", + "framework_version": "0.4.2", + "driver_version": "2.7.0", + "runtime_version": "Moore Threads Driver 2.7.0", + "os": "Ubuntu Jammy Jellyfish (development branch)", + "python_version": "3.10.8" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "float16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "offline", + 
"num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 8, + "throughput_tokens_per_sec": 332.62, + "throughput_tokens_per_sec_per_chip": 332.62, + "throughput_tokens_per_sec_total": 922.83, + "elapsed_seconds_median": 43.4, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 32, + "throughput_tokens_per_sec": 331.64, + "throughput_tokens_per_sec_per_chip": 331.64, + "throughput_tokens_per_sec_total": 920.1, + "elapsed_seconds_median": 43.6, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 128, + "throughput_tokens_per_sec": 331.76, + "throughput_tokens_per_sec_per_chip": 331.76, + "throughput_tokens_per_sec_total": 920.46, + "elapsed_seconds_median": 43.6, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." 
+ } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "17:34:52", + "run_id": "cabb7bd0", + "run_name": "mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0", + "flagged": null, + "reproduce_script": "runners/moorethreads_vllm_musa_f2f6f965/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-18T09:26:10.676960+00:00", + "benchmark_end_time": "2026-05-18T09:34:52.667112+00:00", + "benchmark_elapsed_minutes": 8.7, + "model_load_seconds": 116.8 + } +} \ No newline at end of file diff --git a/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/online/result.json b/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/online/result.json new file mode 100644 index 0000000..064d6b8 --- /dev/null +++ b/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/online/result.json @@ -0,0 +1,163 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_A", + "implementation_id": "moorethreads_vllm_musa_f2f6f965", + "chip": { + "name": "MTT S4000", + "vendor": "Moore Threads", + "count": 1, + "memory_gb": 48.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-18T09:21:31.092840+00:00", + "accelerators": [ + { + "index": 0, + "name": "MTT S4000", + "vendor": "Moore Threads", + "memory_gb": 48.0, + "driver_version": "2.7.0", + "firmware_version": null, + "supports_bf16": true + } + ], + "accelerator_platform": "moorethreads", + "accelerator_topology": null, + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Gold 6430", + "physical_cores": 64, + "logical_cores": 128, + 
"numa_nodes": 2 + }, + "system_memory_gb": 1007.5, + "pcie_generation": "PCIe 16x/16x", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_bond_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu Jammy Jellyfish (development branch)", + "python_version": "3.10.8", + "kernel_version": "5.15.0-105-generic", + "runtime_version": "Moore Threads Driver 2.7.0", + "pytorch_version": "2.2.0" + }, + "software": { + "framework": "vllm-musa", + "framework_version": "0.4.2", + "driver_version": "2.7.0", + "runtime_version": "Moore Threads Driver 2.7.0", + "os": "Ubuntu Jammy Jellyfish (development branch)", + "python_version": "3.10.8" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "float16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "online", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 5, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 194.45, + "ttft_ms_p90": 315.05, + "ttft_ms_p99": 424.55, + "tpot_ms_p50": 201.93, + "tpot_ms_p90": 253.8, + "tpot_ms_p99": 471.28, + "elapsed_seconds_median": 137.6, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 4796.14, + "ttft_ms_p90": 8459.18, + "ttft_ms_p99": 9348.86, 
+ "tpot_ms_p50": 355.01, + "tpot_ms_p90": 6430.04, + "tpot_ms_p99": 15579.83, + "elapsed_seconds_median": 93.0, + "sla_met": false + }, + { + "target_qps": 100, + "achieved_qps": 100.0, + "ttft_ms_p50": 10354.27, + "ttft_ms_p90": 17651.16, + "ttft_ms_p99": 19078.89, + "tpot_ms_p50": 849.82, + "tpot_ms_p90": 8677.79, + "tpot_ms_p99": 14281.03, + "elapsed_seconds_median": 90.0, + "sla_met": false + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "17:53:38", + "run_id": "cabb7bd0", + "run_name": "mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0", + "flagged": null, + "reproduce_script": "runners/moorethreads_vllm_musa_f2f6f965/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-18T09:37:13.745117+00:00", + "benchmark_end_time": "2026-05-18T09:53:38.865501+00:00", + "benchmark_elapsed_minutes": 16.4, + "model_load_seconds": 122.7 + } +} \ No newline at end of file diff --git a/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/result.json b/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/result.json new file mode 100644 index 0000000..e4b1093 --- /dev/null +++ b/results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/result.json @@ -0,0 +1,215 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_A", + "implementation_id": "moorethreads_vllm_musa_f2f6f965", + "chip": { + "name": "MTT S4000", + "vendor": "Moore Threads", + "count": 1, + "memory_gb": 48.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-18T09:21:31.092840+00:00", + "accelerators": [ + { + "index": 0, + 
"name": "MTT S4000", + "vendor": "Moore Threads", + "memory_gb": 48.0, + "driver_version": "2.7.0", + "firmware_version": null, + "supports_bf16": true + } + ], + "accelerator_platform": "moorethreads", + "accelerator_topology": null, + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Gold 6430", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.5, + "pcie_generation": "PCIe 16x/16x", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_bond_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu Jammy Jellyfish (development branch)", + "python_version": "3.10.8", + "kernel_version": "5.15.0-105-generic", + "runtime_version": "Moore Threads Driver 2.7.0", + "pytorch_version": "2.2.0" + }, + "software": { + "framework": "vllm-musa", + "framework_version": "0.4.2", + "driver_version": "2.7.0", + "runtime_version": "Moore Threads Driver 2.7.0", + "os": "Ubuntu Jammy Jellyfish (development branch)", + "python_version": "3.10.8" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "BF16", + "effective_dtype": "float16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenarios_run": [ + "offline", + "online" + ], + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 3, + "extra_config": null + }, + "metrics": { + "derived": {}, + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 8, + 
"throughput_tokens_per_sec": 332.62, + "throughput_tokens_per_sec_per_chip": 332.62, + "throughput_tokens_per_sec_total": 922.83, + "elapsed_seconds_median": 43.4, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 32, + "throughput_tokens_per_sec": 331.64, + "throughput_tokens_per_sec_per_chip": 331.64, + "throughput_tokens_per_sec_total": 920.1, + "elapsed_seconds_median": 43.6, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 128, + "throughput_tokens_per_sec": 331.76, + "throughput_tokens_per_sec_per_chip": 331.76, + "throughput_tokens_per_sec_total": 920.46, + "elapsed_seconds_median": 43.6, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." 
+ } + ] + }, + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 5, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 194.45, + "ttft_ms_p90": 315.05, + "ttft_ms_p99": 424.55, + "tpot_ms_p50": 201.93, + "tpot_ms_p90": 253.8, + "tpot_ms_p99": 471.28, + "elapsed_seconds_median": 137.6, + "sla_met": true + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 4796.14, + "ttft_ms_p90": 8459.18, + "ttft_ms_p99": 9348.86, + "tpot_ms_p50": 355.01, + "tpot_ms_p90": 6430.04, + "tpot_ms_p99": 15579.83, + "elapsed_seconds_median": 93.0, + "sla_met": false + }, + { + "target_qps": 100, + "achieved_qps": 100.0, + "ttft_ms_p50": 10354.27, + "ttft_ms_p90": 17651.16, + "ttft_ms_p99": 19078.89, + "tpot_ms_p50": 849.82, + "tpot_ms_p90": 8677.79, + "tpot_ms_p99": 14281.03, + "elapsed_seconds_median": 90.0, + "sla_met": false + } + ] + } + }, + "accuracy": { + "subset_score": 0.07, + "baseline_delta": -0.53, + "valid": false, + "framework": "vllm-musa", + "precision": "BF16", + "notes": "Integrated accuracy check \u2014 used same vllm-musa instance as benchmark." 
+ }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "17:34:52", + "run_id": "cabb7bd0", + "run_name": "mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0", + "flagged": null, + "reproduce_script": "runners/moorethreads_vllm_musa_f2f6f965/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": "Partial run: ['offline', 'online'] succeeded, ['accuracy'] failed.", + "benchmark_start_time": "2026-05-18T09:26:10.676960+00:00", + "benchmark_end_time": "2026-05-18T09:34:52.667112+00:00", + "benchmark_elapsed_minutes": 25.1, + "model_load_seconds": 116.8, + "benchmark_elapsed_minutes_note": "Total across ['offline', 'online'] scenarios.", + "scenario_dirs": { + "offline": "results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/offline", + "online": "results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/online" + } + } +} \ No newline at end of file diff --git a/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/accuracy/accuracy.json b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/accuracy/accuracy.json new file mode 100644 index 0000000..63c6e92 --- /dev/null +++ b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/accuracy/accuracy.json @@ -0,0 +1,8 @@ +{ + "subset_score": 0.07, + "baseline_delta": -0.31, + "valid": false, + "framework": "vllm-musa", + "precision": "BF16", + "notes": "Integrated accuracy check \u2014 used same vllm-musa instance as benchmark." 
+} \ No newline at end of file diff --git a/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/env_info.json b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/env_info.json new file mode 100644 index 0000000..31f501b --- /dev/null +++ b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/env_info.json @@ -0,0 +1,48 @@ +{ + "collected_at": "2026-05-18T08:40:55.208034+00:00", + "accelerators": [ + { + "index": 0, + "name": "MTT S4000", + "vendor": "Moore Threads", + "memory_gb": 48.0, + "driver_version": "2.7.0", + "firmware_version": null, + "supports_bf16": true + } + ], + "accelerator_platform": "moorethreads", + "accelerator_topology": null, + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Gold 6430", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.5, + "pcie_generation": "PCIe 16x/16x", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_bond_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu Jammy Jellyfish (development branch)", + "python_version": "3.10.8", + "kernel_version": "5.15.0-105-generic", + "runtime_version": "Moore Threads Driver 2.7.0", + "pytorch_version": "2.2.0" +} \ No newline at end of file diff --git a/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/interactive/result.json b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/interactive/result.json new file mode 100644 index 0000000..4f5ff81 --- /dev/null +++ b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/interactive/result.json @@ -0,0 +1,131 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_F", + 
"implementation_id": "moorethreads_vllm_musa_f2f6f965", + "chip": { + "name": "MTT S4000", + "vendor": "Moore Threads", + "count": 1, + "memory_gb": 48.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-18T08:40:55.208034+00:00", + "accelerators": [ + { + "index": 0, + "name": "MTT S4000", + "vendor": "Moore Threads", + "memory_gb": 48.0, + "driver_version": "2.7.0", + "firmware_version": null, + "supports_bf16": true + } + ], + "accelerator_platform": "moorethreads", + "accelerator_topology": null, + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Gold 6430", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.5, + "pcie_generation": "PCIe 16x/16x", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_bond_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu Jammy Jellyfish (development branch)", + "python_version": "3.10.8", + "kernel_version": "5.15.0-105-generic", + "runtime_version": "Moore Threads Driver 2.7.0", + "pytorch_version": "2.2.0" + }, + "software": { + "framework": "vllm-musa", + "framework_version": "0.4.2", + "driver_version": "2.7.0", + "runtime_version": "Moore Threads Driver 2.7.0", + "os": "Ubuntu Jammy Jellyfish (development branch)", + "python_version": "3.10.8" + }, + "model": { + "model_id": "Qwen/Qwen2.5-0.5B-Instruct", + "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 0.5, + "precision": "BF16", + "effective_dtype": "float16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": 
"interactive", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "interactive": { + "ttft_ms_p50": 25.89, + "ttft_ms_p90": 27.18, + "ttft_ms_p99": 28.51, + "tpot_ms_p50": 14.85, + "tpot_ms_p90": 15.17, + "tpot_ms_p99": 15.5, + "peak_memory_gb": null, + "elapsed_seconds_median": 481.4 + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "17:21:09", + "run_id": "4f66d29d", + "run_name": "mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d", + "flagged": null, + "reproduce_script": "runners/moorethreads_vllm_musa_f2f6f965/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-18T08:56:46.686185+00:00", + "benchmark_end_time": "2026-05-18T09:21:09.800661+00:00", + "benchmark_elapsed_minutes": 24.4, + "model_load_seconds": 151.2 + } +} \ No newline at end of file diff --git a/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/offline/result.json b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/offline/result.json new file mode 100644 index 0000000..2498167 --- /dev/null +++ b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/offline/result.json @@ -0,0 +1,164 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_F", + "implementation_id": "moorethreads_vllm_musa_f2f6f965", + "chip": { + "name": "MTT S4000", + "vendor": "Moore Threads", + "count": 1, + "memory_gb": 48.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + 
"environment": { + "collected_at": "2026-05-18T08:40:55.208034+00:00", + "accelerators": [ + { + "index": 0, + "name": "MTT S4000", + "vendor": "Moore Threads", + "memory_gb": 48.0, + "driver_version": "2.7.0", + "firmware_version": null, + "supports_bf16": true + } + ], + "accelerator_platform": "moorethreads", + "accelerator_topology": null, + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Gold 6430", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.5, + "pcie_generation": "PCIe 16x/16x", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_bond_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu Jammy Jellyfish (development branch)", + "python_version": "3.10.8", + "kernel_version": "5.15.0-105-generic", + "runtime_version": "Moore Threads Driver 2.7.0", + "pytorch_version": "2.2.0" + }, + "software": { + "framework": "vllm-musa", + "framework_version": "0.4.2", + "driver_version": "2.7.0", + "runtime_version": "Moore Threads Driver 2.7.0", + "os": "Ubuntu Jammy Jellyfish (development branch)", + "python_version": "3.10.8" + }, + "model": { + "model_id": "Qwen/Qwen2.5-0.5B-Instruct", + "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 0.5, + "precision": "BF16", + "effective_dtype": "float16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "offline", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, 
+ "metrics": { + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 1994.51, + "throughput_tokens_per_sec_per_chip": 1994.51, + "throughput_tokens_per_sec_total": 3642.41, + "elapsed_seconds_median": 12.5, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 1998.44, + "throughput_tokens_per_sec_per_chip": 1998.44, + "throughput_tokens_per_sec_total": 3649.59, + "elapsed_seconds_median": 12.5, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 2004.02, + "throughput_tokens_per_sec_per_chip": 2004.02, + "throughput_tokens_per_sec_total": 3659.77, + "elapsed_seconds_median": 12.5, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." 
+ }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "16:48:27", + "run_id": "4f66d29d", + "run_name": "mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d", + "flagged": null, + "reproduce_script": "runners/moorethreads_vllm_musa_f2f6f965/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-18T08:45:57.373367+00:00", + "benchmark_end_time": "2026-05-18T08:48:27.423209+00:00", + "benchmark_elapsed_minutes": 2.5, + "model_load_seconds": 146.8 + } +} \ No newline at end of file diff --git a/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/online/result.json b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/online/result.json new file mode 100644 index 0000000..eb13372 --- /dev/null +++ b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/online/result.json @@ -0,0 +1,151 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_F", + "implementation_id": "moorethreads_vllm_musa_f2f6f965", + "chip": { + "name": "MTT S4000", + "vendor": "Moore Threads", + "count": 1, + "memory_gb": 48.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-18T08:40:55.208034+00:00", + "accelerators": [ + { + "index": 0, + "name": "MTT S4000", + "vendor": "Moore Threads", + "memory_gb": 48.0, + "driver_version": "2.7.0", + "firmware_version": null, + "supports_bf16": true + } + ], + "accelerator_platform": "moorethreads", + "accelerator_topology": null, + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Gold 6430", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.5, + "pcie_generation": "PCIe 16x/16x", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": 
"mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_bond_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu Jammy Jellyfish (development branch)", + "python_version": "3.10.8", + "kernel_version": "5.15.0-105-generic", + "runtime_version": "Moore Threads Driver 2.7.0", + "pytorch_version": "2.2.0" + }, + "software": { + "framework": "vllm-musa", + "framework_version": "0.4.2", + "driver_version": "2.7.0", + "runtime_version": "Moore Threads Driver 2.7.0", + "os": "Ubuntu Jammy Jellyfish (development branch)", + "python_version": "3.10.8" + }, + "model": { + "model_id": "Qwen/Qwen2.5-0.5B-Instruct", + "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 0.5, + "precision": "BF16", + "effective_dtype": "float16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "online", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": null, + "runtime_metrics": null + }, + "metrics": { + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 40, + "results_by_qps": [ + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 47.68, + "ttft_ms_p90": 96.31, + "ttft_ms_p99": 956.22, + "tpot_ms_p50": 47.25, + "tpot_ms_p90": 80.82, + "tpot_ms_p99": 131.63, + "elapsed_seconds_median": 37.8, + "sla_met": false + }, + { + "target_qps": 40, + "achieved_qps": 40.0, + "ttft_ms_p50": 94.5, + "ttft_ms_p90": 194.64, + "ttft_ms_p99": 331.88, + "tpot_ms_p50": 74.76, + "tpot_ms_p90": 287.01, + "tpot_ms_p99": 444.19, + "elapsed_seconds_median": 19.0, + "sla_met": true + } + ] + } + }, + "accuracy": { + "subset_score": null, 
+ "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "16:53:54", + "run_id": "4f66d29d", + "run_name": "mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d", + "flagged": null, + "reproduce_script": "runners/moorethreads_vllm_musa_f2f6f965/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-18T08:51:01.188901+00:00", + "benchmark_end_time": "2026-05-18T08:53:54.250762+00:00", + "benchmark_elapsed_minutes": 2.9, + "model_load_seconds": 132.6 + } +} \ No newline at end of file diff --git a/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/result.json b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/result.json new file mode 100644 index 0000000..a1c073d --- /dev/null +++ b/results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/result.json @@ -0,0 +1,215 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_F", + "implementation_id": "moorethreads_vllm_musa_f2f6f965", + "chip": { + "name": "MTT S4000", + "vendor": "Moore Threads", + "count": 1, + "memory_gb": 48.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-18T08:40:55.208034+00:00", + "accelerators": [ + { + "index": 0, + "name": "MTT S4000", + "vendor": "Moore Threads", + "memory_gb": 48.0, + "driver_version": "2.7.0", + "firmware_version": null, + "supports_bf16": true + } + ], + "accelerator_platform": "moorethreads", + "accelerator_topology": null, + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Gold 6430", + "physical_cores": 64, + "logical_cores": 128, + "numa_nodes": 2 + }, + "system_memory_gb": 1007.5, + "pcie_generation": "PCIe 
16x/16x", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": [ + { + "name": "mlx5_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_1", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + }, + { + "name": "mlx5_bond_0", + "type": "InfiniBand/RoCE", + "bandwidth_gbps": null + } + ], + "os": "Ubuntu Jammy Jellyfish (development branch)", + "python_version": "3.10.8", + "kernel_version": "5.15.0-105-generic", + "runtime_version": "Moore Threads Driver 2.7.0", + "pytorch_version": "2.2.0" + }, + "software": { + "framework": "vllm-musa", + "framework_version": "0.4.2", + "driver_version": "2.7.0", + "runtime_version": "Moore Threads Driver 2.7.0", + "os": "Ubuntu Jammy Jellyfish (development branch)", + "python_version": "3.10.8" + }, + "model": { + "model_id": "Qwen/Qwen2.5-0.5B-Instruct", + "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 0.5, + "precision": "BF16", + "effective_dtype": "float16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenarios_run": [ + "offline", + "online", + "interactive" + ], + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 3, + "extra_config": null + }, + "metrics": { + "derived": {}, + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 1994.51, + "throughput_tokens_per_sec_per_chip": 1994.51, + "throughput_tokens_per_sec_total": 3642.41, + "elapsed_seconds_median": 12.5, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. 
The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 1998.44, + "throughput_tokens_per_sec_per_chip": 1998.44, + "throughput_tokens_per_sec_total": 3649.59, + "elapsed_seconds_median": 12.5, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 2004.02, + "throughput_tokens_per_sec_per_chip": 2004.02, + "throughput_tokens_per_sec_total": 3659.77, + "elapsed_seconds_median": 12.5, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." 
+ } + ] + }, + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 40, + "results_by_qps": [ + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 47.68, + "ttft_ms_p90": 96.31, + "ttft_ms_p99": 956.22, + "tpot_ms_p50": 47.25, + "tpot_ms_p90": 80.82, + "tpot_ms_p99": 131.63, + "elapsed_seconds_median": 37.8, + "sla_met": false + }, + { + "target_qps": 40, + "achieved_qps": 40.0, + "ttft_ms_p50": 94.5, + "ttft_ms_p90": 194.64, + "ttft_ms_p99": 331.88, + "tpot_ms_p50": 74.76, + "tpot_ms_p90": 287.01, + "tpot_ms_p99": 444.19, + "elapsed_seconds_median": 19.0, + "sla_met": true + } + ] + }, + "interactive": { + "ttft_ms_p50": 25.89, + "ttft_ms_p90": 27.18, + "ttft_ms_p99": 28.51, + "tpot_ms_p50": 14.85, + "tpot_ms_p90": 15.17, + "tpot_ms_p99": 15.5, + "peak_memory_gb": null, + "elapsed_seconds_median": 481.4 + } + }, + "accuracy": { + "subset_score": 0.07, + "baseline_delta": -0.31, + "valid": false, + "framework": "vllm-musa", + "precision": "BF16", + "notes": "Integrated accuracy check \u2014 used same vllm-musa instance as benchmark." 
+ }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "16:48:27", + "run_id": "4f66d29d", + "run_name": "mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d", + "flagged": null, + "reproduce_script": "runners/moorethreads_vllm_musa_f2f6f965/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": "Partial run: ['offline', 'online', 'interactive'] succeeded, ['accuracy'] failed.", + "benchmark_start_time": "2026-05-18T08:45:57.373367+00:00", + "benchmark_end_time": "2026-05-18T08:48:27.423209+00:00", + "benchmark_elapsed_minutes": 29.8, + "model_load_seconds": 146.8, + "benchmark_elapsed_minutes_note": "Total across ['offline', 'online', 'interactive'] scenarios.", + "scenario_dirs": { + "offline": "results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/offline", + "online": "results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/online", + "interactive": "results/community/mtt_s4000x1_suite_F_moorethreads_vllm_musa_f2f6f965_4f66d29d/interactive" + } + } +} \ No newline at end of file diff --git a/runners/README.md b/runners/README.md index 95290aa..aaf4d81 100644 --- a/runners/README.md +++ b/runners/README.md @@ -252,7 +252,7 @@ nvidia_trtllm_fp8_8d2f1a4b amd_vllm_rocm_7b2e1d8f ascend_mindie_9c4a3f11 apple_mlx_b3e21f09 -moorethreads_vllm_musa_57ff5443 +moorethreads_vllm_musa_f2f6f965 ``` --- diff --git a/runners/moorethreads_vllm_musa_f2f6f965/README.md b/runners/moorethreads_vllm_musa_f2f6f965/README.md new file mode 100644 index 0000000..5111bdc --- /dev/null +++ b/runners/moorethreads_vllm_musa_f2f6f965/README.md @@ -0,0 +1,145 @@ +# moorethreads_vllm_musa_f2f6f965 — Moore Threads MUSA Runner (vllm-musa) + +AccelMark runner for Moore Threads MUSA GPUs using +[vllm-musa](https://github.com/MooreThreads/vllm-musa). 
+ +## Supported suites + +| Suite | Description | Notes | +|-------|-------------|-------| +| Suite A | Single-chip, Llama-3-8B | Validated on S4000 (default: accuracy/offline/online) | +| Suite B | Multi-chip, Llama-3-70B | MCCL tensor parallelism; set `VLLM_WORKER_MULTIPROC_METHOD=spawn` | +| Suite C | Quantization, Llama-3.1-8B | FP8 skipped (not supported); W8A8/W8A16 via compressed-tensors | +| Suite D | Long context ~28K input, Llama-3.1-8B | Reduce `max_num_seqs` / `gpu_memory_utilization` in runner config | +| Suite E | Multi-chip scaling, Llama-3-8B | MCCL tensor parallelism | +| Suite F | Edge, Qwen2.5-0.5B | Validated on MTT S4000 (community result in repo) | +| Suite G | MoE multi-chip, Mixtral-8x7B | Unsupported | + +## Hardware compatibility + +| GPU | BF16 / FP16 | Multi-chip TP | FP8 | Notes | +|-----|-------------|---------------|-----|-------| +| MTT S4000 / S5000 | ✅ (BF16 → float16 on vLLM < 0.10) | ✅ (MCCL) | ❌ | Tested with vLLM 0.4.x+musa | +| MTT S3000 / S80 | ✅ | ✅ | ❌ | May need `--enforce-eager` on Triton errors | + +FP8 is excluded — not supported on this runner. FP32 inference fails with +FlashAttention on MUSA (use FP16 or BF16). Qwen3 requires a newer vLLM + MUSA port +(Qwen2.5 / Llama-3 work on 0.4.x). + +## Prerequisites + +Install in this order — **do not** `pip install torch` or `vllm` from PyPI on a +bare Linux host: + +**1. MUSA toolkit + driver** + + + +**2. vllm-musa (official build)** + +| Resource | URL | +|----------|-----| +| Repository | [MooreThreads/vllm-musa](https://github.com/MooreThreads/vllm-musa) | +| Build guide | [README_vllm_musa.md](https://github.com/MooreThreads/vllm-musa/blob/main/README_vllm_musa.md) | +| PyTorch MUSA | | + +```bash +git clone https://github.com/MooreThreads/vllm-musa.git +cd vllm-musa +bash build_musa.sh +python -c "from vllm import LLM; print('vllm ok')" +``` + +**3. Runner dependencies** + +```bash +pip install -r runners/moorethreads_vllm_musa_f2f6f965/requirements.txt +``` + +Pin `transformers` to **4.43–4.46** (not 5.x) when on vLLM 0.4.x.
+ +**Environment variables** + +```bash +export MUSA_VISIBLE_DEVICES=0 +export VLLM_WORKER_MULTIPROC_METHOD=spawn # when tensor_parallel_size > 1 +``` + +## Smoke test + +```bash +python runners/moorethreads_vllm_musa_f2f6f965/test_smoke.py +python runners/moorethreads_vllm_musa_f2f6f965/test_smoke.py /path/to/model +``` + +## Accuracy + +AccelMark runs an integrated MMLU subset after each benchmark using the **same** +vLLM instance as the perf run. The runner sets `device=musa`, dtype, and +tokenizer correctly; low scores on vLLM **0.4.x+musa** reflect broken generation +in that stack, not missing AccelMark wiring. + +| Model | Suite | Measured | Baseline | +|-------|-------|----------|----------| +| Qwen2.5-0.5B-Instruct | F | **~0.07** | 0.37 (FP16) / 0.38 (BF16) | +| Llama-3-8B-Instruct | A | **~0.07** | 0.60 (BF16) | + +Throughput completes normally; answers are effectively random (repetition, system +prompt regurgitation, similar ~7% across different models). + +While accuracy is broken on 0.4.x, use `--skip-accuracy-gate` to finish a perf run: + +```bash +python run.py --runner moorethreads_vllm_musa_f2f6f965 \ + --suite suite_F --precision FP16 --skip-accuracy-gate +``` + +Likely fix: upgrade to vllm-musa aligned with vLLM **0.10+**, keep +`transformers` 4.43–4.46 on legacy forks, then re-run without +`--skip-accuracy-gate`.
+ +## Usage + +```bash +python run.py --runner moorethreads_vllm_musa_f2f6f965 --suite suite_F --precision FP16 + +VLLM_WORKER_MULTIPROC_METHOD=spawn \ +python run.py --runner moorethreads_vllm_musa_f2f6f965 \ + --suite suite_B --tensor-parallel-size 8 +``` + +Optional runner config (copy and edit): + +```bash +cp configs/runner_configs/runner_moorethreads_vllm_musa_f2f6f965.yaml.example \ + configs/runner_configs/runner_moorethreads_vllm_musa_f2f6f965.yaml +``` + +| Field | Default | Notes | +|-------|---------|-------| +| `tensor_parallel_size` | 1 | MCCL tensor parallelism | +| `enforce_eager` | false | Only if Triton / graph capture errors | +| `max_num_seqs` | 256 | Lower on small HBM | +| `gpu_memory_utilization` | 0.85 | Lower if OOM | + +## Troubleshooting + +| Symptom | Fix | +|---------|-----| +| `GLIBCXX_3.4.30` on import | Import `torch` before `transformers` (runner and smoke test do this) | +| `KeyError: 'type'` in rope_scaling | Pin `transformers==4.46.3` (not 5.x) | +| `Expected musa device, got cuda:0` | Use this runner (`device="musa"`) | +| MMLU ~0.07 | See [Accuracy](#accuracy); `--skip-accuracy-gate` for perf-only runs | +| OOM | Lower `gpu_memory_utilization` / `max_num_seqs` | +| Triton / graph errors | `--enforce-eager` or `enforce_eager: true` in runner YAML | + +## Requirements + +See `requirements.txt` for AccelMark extras. vLLM, torch_musa, and the MUSA +driver are installed per the official vllm-musa guide above (not from this file). 
+ +Minimum environment: + +- Moore Threads GPU with MUSA driver +- Python 3.10+ +- vllm-musa build per [MooreThreads/vllm-musa](https://github.com/MooreThreads/vllm-musa) diff --git a/runners/moorethreads_vllm_musa_f2f6f965/meta.json b/runners/moorethreads_vllm_musa_f2f6f965/meta.json new file mode 100644 index 0000000..e57d72d --- /dev/null +++ b/runners/moorethreads_vllm_musa_f2f6f965/meta.json @@ -0,0 +1,21 @@ +{ + "id": "moorethreads_vllm_musa_f2f6f965", + "platform": "moorethreads", + "name": "vllm-musa on Moore Threads MUSA GPU", + "framework": "vllm-musa", + "submitted_by": "JuhaoLiang1997", + "description": "AccelMark runner for Moore Threads MUSA GPUs using vllm-musa (https://github.com/MooreThreads/vllm-musa). Install torch/vllm per upstream README_vllm_musa.md; requirements.txt adds benchmark deps only. Sets device=musa; BF16 maps to float16 on vLLM <0.10. MCCL tensor parallelism. FP8 unsupported.", + "supersedes_chain": [], + "notes": "Smoke-tested on MTT S4000 (vLLM 0.4.2+musa): Suite A and F default scenarios run. MMLU not at baseline — see runner README.", + "created": "2026-05-18", + "hardware_label": null, + "suite_support": { + "A": "validated", + "B": "pending", + "C": "pending", + "D": "pending", + "E": "pending", + "F": "validated", + "G": "unsupported" + } +} diff --git a/runners/moorethreads_vllm_musa_f2f6f965/requirements.txt b/runners/moorethreads_vllm_musa_f2f6f965/requirements.txt new file mode 100644 index 0000000..1fe16ee --- /dev/null +++ b/runners/moorethreads_vllm_musa_f2f6f965/requirements.txt @@ -0,0 +1,22 @@ +# AccelMark — moorethreads_vllm_musa_f2f6f965 +# +# AccelMark benchmark dependencies only. 
Install MUSA toolkit, torch_musa, and +# vllm-musa first — see README.md and https://github.com/MooreThreads/vllm-musa +# +# pip install -r runners/moorethreads_vllm_musa_f2f6f965/requirements.txt +# python runners/moorethreads_vllm_musa_f2f6f965/test_smoke.py + +# AccelMark / loadgen +numpy==1.26.4 +jsonschema==4.25.1 +psutil==7.1.0 +tqdm==4.67.1 +aiohttp==3.12.15 +PyYAML==6.0.2 + +# Tokenizer / config (pin to match vLLM 0.4.x — see README) +transformers>=4.43.0,<4.47.0 +tokenizers>=0.20.0,<0.21.0 +huggingface-hub>=0.26.0,<0.27.0 +accelerate>=1.2.0,<1.3.0 +safetensors>=0.4.5,<0.5.0 diff --git a/runners/moorethreads_vllm_musa_f2f6f965/runner.py b/runners/moorethreads_vllm_musa_f2f6f965/runner.py new file mode 100644 index 0000000..b693369 --- /dev/null +++ b/runners/moorethreads_vllm_musa_f2f6f965/runner.py @@ -0,0 +1,440 @@ +""" +AccelMark — Moore Threads MUSA vLLM benchmark runner (vllm-musa). + +Implements BenchmarkRunner for vllm-musa on Moore Threads MUSA GPUs. +See README.md in this folder for install and hardware notes. 
+""" + +import asyncio +import gc +import sys +import time +from pathlib import Path +from typing import Optional + +_REPO_ROOT = Path(__file__).resolve().parent.parent.parent +sys.path.insert(0, str(_REPO_ROOT)) + +from runners.benchmark_runner import BenchmarkRunner, InferenceRequest +from loadgen.types import InferenceResult + +import logging +logging.getLogger("vllm.engine.async_llm_engine").setLevel(logging.WARNING) +logging.getLogger("vllm.engine.llm_engine").setLevel(logging.WARNING) + + +class MoorethreadsVLLMMUSARunner(BenchmarkRunner): + """vLLM on Moore Threads MUSA via vllm-musa.""" + + SUPPORTS_STREAMING = True + SUPPORTS_BATCHING = True + SUPPORTS_ONLINE = True + SUPPORTS_MULTI_CHIP = True + + SUPPORTED_PRECISIONS = ["bf16", "fp16"] + SUPPORTED_QUANTIZATION_BACKENDS = ["compressed-tensors"] + + _musa_runtime_prepared = False + + def __init__(self): + self.llm = None + self.engine = None + self.tokenizer = None + self.sampling_params = None + self._loop: asyncio.AbstractEventLoop = None + + def _get_chip_count(self) -> int: + try: + import pymtml + pymtml.mtmlInit() + try: + n = pymtml.mtmlDeviceGetCount() + finally: + try: + pymtml.mtmlShutdown() + except Exception: + pass + if n and n > 0: + return int(n) + except Exception: + pass + try: + import torch + n = torch.cuda.device_count() + return n if n > 0 else 1 + except Exception: + return 1 + + def _get_framework_name(self) -> str: + return "vllm-musa" + + def _get_framework_version(self) -> str: + plugin_version = "unknown" + try: + from importlib.metadata import version + plugin_version = version("vllm-musa") + except Exception: + try: + import vllm_musa_platform # type: ignore + plugin_version = getattr(vllm_musa_platform, "__version__", "unknown") + except Exception: + pass + try: + import vllm + core_version = vllm.__version__ + except Exception: + core_version = "unknown" + if plugin_version == "unknown" and core_version == "unknown": + return "unknown" + if plugin_version == "unknown": + 
return core_version + return f"{plugin_version}+vllm-{core_version}" + + def get_model_format(self) -> str: + return "HuggingFace original" + + @classmethod + def _prepare_musa_runtime(cls) -> None: + if cls._musa_runtime_prepared: + return + import torch # noqa: F401 + cls._musa_runtime_prepared = True + + @staticmethod + def _legacy_vllm_musa() -> bool: + try: + import vllm + ver = vllm.__version__.split("+")[0] + major, minor = (int(x) for x in ver.split(".")[:2]) + return (major, minor) < (0, 10) + except Exception: + return True + + @staticmethod + def _get_engine_arg_fields() -> set[str]: + try: + import dataclasses + from vllm.engine.arg_utils import EngineArgs + return {f.name for f in dataclasses.fields(EngineArgs)} + except Exception: + return set() + + def _resolve_musa_dtype(self, dtype: str, precision: str) -> str: + if not self._legacy_vllm_musa(): + return dtype + if dtype in ("bfloat16", "auto") or precision.upper() == "BF16": + if dtype != "float16": + print(" Note: vLLM 0.4.x+musa — using float16") + return "float16" + return dtype + + def load_model(self, model_path: str, parallelism: dict) -> None: + self._prepare_musa_runtime() + + from transformers import AutoTokenizer + from vllm import LLM, AsyncLLMEngine, SamplingParams + from vllm.engine.arg_utils import AsyncEngineArgs + + tp_size = parallelism["tensor_parallel_size"] + pp_size = parallelism["pipeline_parallel_size"] + ep_size = parallelism.get("expert_parallel_size", 1) + assert pp_size <= 1, ( + "Pipeline parallelism is not supported. Use --tensor-parallel-size." 
+ ) + + max_tokens = parallelism["max_tokens"] + max_model_len = parallelism["max_model_len"] + use_async = parallelism["use_async"] + enforce_eager = getattr(self, "_enforce_eager", False) + + cfg = getattr(self, "_runner_config", {}) + max_num_seqs = cfg.get("max_num_seqs", 256) + musa_memory_util = cfg.get("gpu_memory_utilization", 0.85) + extra_kwargs = dict(cfg.get("engine_kwargs") or {}) + + _valid_engine_fields = self._get_engine_arg_fields() + if _valid_engine_fields: + _dropped = {k: v for k, v in extra_kwargs.items() + if k not in _valid_engine_fields} + if _dropped: + print(f" Warning: engine_kwargs keys not supported by this " + f"vllm-musa / vLLM version and will be ignored: " + f"{list(_dropped)}") + extra_kwargs = {k: v for k, v in extra_kwargs.items() + if k in _valid_engine_fields} + + effective_precision = getattr(self, "_effective_precision", "BF16").upper() + precision = getattr(self, "_precision", None) or effective_precision + _dtype_override = getattr(self, "_precision_dtype_override", None) + _prec_eng_kwargs = dict(getattr(self, "_precision_engine_kwargs", None) or {}) + quantization = _prec_eng_kwargs.pop("quantization", None) + + _NATIVE_DTYPE_MAP = {"BF16": "bfloat16", "FP16": "float16", "FP32": "float32"} + dtype = _NATIVE_DTYPE_MAP.get(precision, "auto") + self._quantization_method = quantization + + if _dtype_override: + dtype = _dtype_override + dtype = self._resolve_musa_dtype(dtype, precision) + if _prec_eng_kwargs: + _prec_eng_kwargs.update(extra_kwargs) + extra_kwargs = _prec_eng_kwargs + + if "speculative_model" in extra_kwargs and "speculative_config" not in extra_kwargs: + extra_kwargs["speculative_config"] = { + "model": extra_kwargs.pop("speculative_model"), + "num_speculative_tokens": extra_kwargs.pop("num_speculative_tokens", 4), + "draft_tensor_parallel_size": extra_kwargs.pop( + "speculative_draft_tensor_parallel_size", 1 + ), + } + + print( + f"Loading model: precision={precision}, dtype={dtype}" + + (f", 
quantization_method={self._quantization_method}" + if self._quantization_method else "") + ) + + self.tokenizer = AutoTokenizer.from_pretrained( + model_path, trust_remote_code=False + ) + self.sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0) + + base_kwargs = dict( + model=model_path, + dtype=dtype, + tensor_parallel_size=tp_size, + trust_remote_code=False, + enforce_eager=enforce_eager, + ) + if not _valid_engine_fields or "device" in _valid_engine_fields: + base_kwargs["device"] = "musa" + if ep_size > 1: + base_kwargs["enable_expert_parallel"] = True + if quantization: + base_kwargs["quantization"] = quantization + if max_model_len: + base_kwargs["max_model_len"] = max_model_len + + if not use_async: + self.llm = LLM(**{ + **base_kwargs, + "max_num_seqs": max_num_seqs, + "gpu_memory_utilization": musa_memory_util, + **extra_kwargs, + }) + else: + self._loop = asyncio.new_event_loop() + asyncio.set_event_loop(self._loop) + engine_args = AsyncEngineArgs(**{ + **base_kwargs, + "gpu_memory_utilization": musa_memory_util, + **extra_kwargs, + }) + self.engine = AsyncLLMEngine.from_engine_args(engine_args) + + def get_effective_dtype(self) -> Optional[str]: + try: + if self.llm is not None: + return str(self.llm.llm_engine.model_config.dtype).replace("torch.", "") + if self.engine is not None: + return str(self.engine.engine.model_config.dtype).replace("torch.", "") + except Exception: + pass + return getattr(self, "_effective_dtype", None) + + def inference_fn_offline(self, requests: list[InferenceRequest]) -> list[InferenceResult]: + formatted = [self._format_prompt(r.prompt) for r in requests] + t_start = time.perf_counter() + outputs = self.llm.generate(formatted, self.sampling_params) + elapsed = time.perf_counter() - t_start + + self._last_accuracy_outputs = [o.outputs[0].text for o in outputs] + + return [ + InferenceResult( + first_token_time_ms=None, + total_time_ms=elapsed * 1000, + output_tokens=len(o.outputs[0].token_ids), + 
input_tokens=len(o.prompt_token_ids), + success=True, + output_text=o.outputs[0].text, + ) + for o in outputs + ] + + async def inference_fn_streaming(self, request: InferenceRequest) -> InferenceResult: + from vllm.utils import random_uuid + + formatted = self._format_prompt(request.prompt) + request_id = random_uuid() + t_start = time.perf_counter() + first_token_time_ms = None + output_tokens = 0 + output_text = "" + + async for output in self.engine.generate( + formatted, self.sampling_params, request_id + ): + if first_token_time_ms is None and len(output.outputs[0].token_ids) > 0: + first_token_time_ms = (time.perf_counter() - t_start) * 1000 + output_tokens = len(output.outputs[0].token_ids) + output_text = output.outputs[0].text + + return InferenceResult( + first_token_time_ms=first_token_time_ms, + total_time_ms=(time.perf_counter() - t_start) * 1000, + output_tokens=output_tokens, + input_tokens=0, + success=True, + output_text=output_text, + ) + + async def inference_fn_token_stream(self, request: InferenceRequest): + from vllm.utils import random_uuid + + formatted = self._format_prompt(request.prompt) + request_id = random_uuid() + prev_length = 0 + + async for output in self.engine.generate( + formatted, self.sampling_params, request_id + ): + current_text = output.outputs[0].text + delta = current_text[prev_length:] + if delta: + yield delta + prev_length = len(current_text) + + def get_peak_memory_gb(self) -> Optional[float]: + try: + import torch + return torch.cuda.max_memory_allocated() / (1024 ** 3) + except Exception: + pass + try: + import pymtml + pymtml.mtmlInit() + try: + dev = pymtml.mtmlDeviceGetByIndex(0) + info = pymtml.mtmlDeviceGetMemoryInfo(dev) + used = getattr(info, "used", None) + if used is not None: + return float(used) / (1024 ** 3) + finally: + try: + pymtml.mtmlShutdown() + except Exception: + pass + except Exception: + pass + return None + + def release_resources(self) -> None: + if self.llm is not None: + try: + del 
self.llm + except Exception: + pass + self.llm = None + + if self.engine is not None: + try: + if self._loop and not self._loop.is_closed(): + self._loop.run_until_complete(self.engine.shutdown()) + except Exception: + pass + try: + del self.engine + except Exception: + pass + self.engine = None + + try: + from vllm.distributed.parallel_state import cleanup_dist_env_and_memory + cleanup_dist_env_and_memory(shutdown_ray=False) + except Exception: + try: + from vllm.distributed.parallel_state import ( + destroy_model_parallel, + destroy_distributed_environment, + ) + destroy_model_parallel() + destroy_distributed_environment() + except Exception: + pass + + try: + import torch + if torch.distributed.is_initialized(): + torch.distributed.destroy_process_group() + except Exception: + pass + + gc.collect() + + try: + import torch + torch.cuda.empty_cache() + torch.cuda.reset_peak_memory_stats() + except Exception: + pass + + def parse_args(self): + """Add vllm-musa-specific CLI flags. Base class pre-loads runner config.""" + args = super().parse_args() + cfg = self._runner_config + + import argparse + parser = argparse.ArgumentParser(add_help=False) + parser.add_argument("--tensor-parallel-size", type=int, default=None, + dest="tensor_parallel_size") + parser.add_argument("--expert-parallel-size", type=int, default=None, + dest="expert_parallel_size") + parser.add_argument("--enforce-eager", action="store_true", default=False, + dest="enforce_eager") + extra, _ = parser.parse_known_args() + + tp_size, _tp_source = self._resolve_tensor_parallel_size( + extra.tensor_parallel_size + ) + ep_size = (extra.expert_parallel_size + if extra.expert_parallel_size is not None + else cfg.get("expert_parallel_size", 1)) + self._enforce_eager = extra.enforce_eager or cfg.get("enforce_eager", False) + + print(f" tensor_parallel_size = {tp_size} [{_tp_source}]") + if ep_size > 1: + print(f" expert_parallel_size = {ep_size} [cli/yaml]") + + if not self.SUPPORTS_MULTI_CHIP and tp_size > 
1: + print(f"Warning: {self.__class__.__name__} does not support multi-chip. " + f"Ignoring tensor_parallel_size={tp_size}, using 1.") + tp_size = 1 + ep_size = 1 + + self._parallelism = { + "tensor_parallel_size": tp_size, + "pipeline_parallel_size": 1, + "expert_parallel_size": ep_size, + "data_parallel_size": 1, + } + self._chip_count = tp_size + self._precision = getattr(args, "precision", None) + return args + + def get_extra_subprocess_args(self, args) -> list[str]: + extra = [ + "--tensor-parallel-size", + str(self._parallelism.get("tensor_parallel_size", 1)), + ] + if self._parallelism.get("expert_parallel_size", 1) > 1: + extra += ["--expert-parallel-size", + str(self._parallelism["expert_parallel_size"])] + if self._enforce_eager: + extra += ["--enforce-eager"] + return extra + + +if __name__ == "__main__": + MoorethreadsVLLMMUSARunner().main() diff --git a/runners/moorethreads_vllm_musa_f2f6f965/test_smoke.py b/runners/moorethreads_vllm_musa_f2f6f965/test_smoke.py new file mode 100644 index 0000000..86cbbf9 --- /dev/null +++ b/runners/moorethreads_vllm_musa_f2f6f965/test_smoke.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python3 +""" +Standalone vllm-musa smoke test (does not use the AccelMark runner). 
+ +Usage (from repo root): + + python runners/moorethreads_vllm_musa_f2f6f965/test_smoke.py + python runners/moorethreads_vllm_musa_f2f6f965/test_smoke.py /path/to/model + + MODEL_PATH=/path/to/Qwen2.5-0.5B-Instruct \\ + python runners/moorethreads_vllm_musa_f2f6f965/test_smoke.py +""" + +from __future__ import annotations + +import gc +import os +import sys +import time + +import torch # noqa: F401 — before transformers/vllm (libstdc++ load order) + +from vllm import LLM, SamplingParams + +_DEFAULT_MODEL = os.getenv("MODEL_PATH", "Qwen/Qwen2.5-0.5B-Instruct") + +PROMPTS = [ + "The capital of France is", + "Say hello in one short sentence.", +] + + +def main() -> int: + model_path = sys.argv[1] if len(sys.argv) > 1 else _DEFAULT_MODEL + + sampling_params = SamplingParams(temperature=0.0, max_tokens=64) + + print(f"Loading {model_path} ...") + t_load = time.perf_counter() + llm = LLM( + model=model_path, + device="musa", + dtype="float16", + tensor_parallel_size=1, + max_model_len=1024, + max_num_seqs=4, + gpu_memory_utilization=0.85, + trust_remote_code=False, + ) + print(f"Model loaded in {time.perf_counter() - t_load:.1f}s\n") + + t_infer = time.perf_counter() + outputs = llm.generate(PROMPTS, sampling_params) + print(f"Inference done in {time.perf_counter() - t_infer:.1f}s\n") + + for prompt, output in zip(PROMPTS, outputs): + text = output.outputs[0].text + n_tokens = len(output.outputs[0].token_ids) + print(f"Prompt: {prompt!r}") + print(f"Output: {text!r}") + print(f"Tokens: {n_tokens}\n") + + del llm + gc.collect() + try: + if hasattr(torch, "musa"): + torch.musa.empty_cache() + else: + torch.cuda.empty_cache() + except Exception: + pass + print("Done.") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/runners/platforms/moorethreads.py b/runners/platforms/moorethreads.py index 708db1b..9f55684 100644 --- a/runners/platforms/moorethreads.py +++ b/runners/platforms/moorethreads.py @@ -1,17 +1,13 @@ """Moore Threads MUSA GPU 
platform plug-in. -Moore Threads ships its own driver and management tooling: - -* ``mthreads-gmi`` — the moral equivalent of ``nvidia-smi`` / ``rocm-smi``. -* ``pymtml`` — Python bindings analogous to NVML / pynvml. -* ``torchada`` — a CUDA→MUSA compatibility shim that exposes the standard - ``torch.cuda`` API, with the real backend version available via - ``torch.version.musa``. - -This plug-in first tries the Python bindings (best machine-readable -output) and falls back to scraping ``mthreads-gmi`` text output. Both -paths are best-effort: when none of the tools are installed the plug-in -silently reports zero accelerators and the collector moves on. +Used by ``runners/collect_env.py`` to populate ``env_info.json``. + +Detection order (first non-empty wins): + + 1. ``pymtml`` (mthreads-ml-py) — same API as used in the vllm-musa runner + 2. ``mthreads-gmi`` text output + 3. ``torch`` device properties (``torch.cuda`` aliased to MUSA via torchada, + or native ``torch.musa`` when available) """ from __future__ import annotations @@ -23,8 +19,6 @@ VENDOR_LABEL = "Moore Threads" PRIORITY = 60 -# S5000 / S4000 datacenter SKUs ship with native BF16 support; the older -# consumer-class MTT S80/S70 cards are FP16-only. 
_BF16_SUPPORTED_HINTS = ("s5000", "s4000", "s3000") _NO_BF16_HINTS = ("s80", "s70", "s60", "s50") @@ -40,50 +34,68 @@ def _supports_bf16(chip_name: str) -> bool: return True +def _driver_version_from_smi() -> str | None: + try: + out = subprocess.check_output( + ["mthreads-gmi"], text=True, stderr=subprocess.DEVNULL + ) + m = re.search(r"Driver\s+Version\s*:\s*(\S+)", out, re.IGNORECASE) + if m: + return m.group(1) + except Exception: + pass + return None + + def _collect_via_pymtml() -> list[dict]: try: - import pymtml as mtml # type: ignore[import-not-found] + import pymtml except ImportError: return [] try: - mtml.mtmlInit() + pymtml.mtmlInit() except Exception: return [] + driver = _driver_version_from_smi() or "unknown" accelerators: list[dict] = [] try: - count = mtml.mtmlDeviceGetCount() + count = pymtml.mtmlDeviceGetCount() except Exception: try: - mtml.mtmlShutdown() + pymtml.mtmlShutdown() except Exception: pass return [] for idx in range(int(count)): try: - handle = mtml.mtmlDeviceGetHandleByIndex(idx) - name = mtml.mtmlDeviceGetName(handle) - mem = mtml.mtmlDeviceGetMemoryInfo(handle) - total_mb = getattr(mem, "total", None) or mem.get("total", 0) - driver = mtml.mtmlSystemGetDriverVersion() + dev = pymtml.mtmlDeviceGetByIndex(idx) + name = pymtml.mtmlDeviceGetName(dev) + mem = pymtml.mtmlDeviceGetMemoryInfo(dev) + total_bytes = getattr(mem, "total", None) + if total_bytes is None and isinstance(mem, dict): + total_bytes = mem.get("total") except Exception: continue + if not isinstance(name, str): + name = name.decode("utf-8", "ignore") + memory_gb = round(int(total_bytes) / (1024 ** 3), 1) if total_bytes else None accelerators.append( { "index": idx, - "name": name if isinstance(name, str) else name.decode("utf-8", "ignore"), + "name": name, "vendor": VENDOR_LABEL, - "memory_gb": round(int(total_mb) / 1024, 1) if total_mb else None, - "driver_version": driver if isinstance(driver, str) else driver.decode("utf-8", "ignore"), + "memory_gb": memory_gb, + 
"driver_version": driver, "firmware_version": None, - "supports_bf16": _supports_bf16(str(name)), + "supports_bf16": _supports_bf16(name), } ) try: - mtml.mtmlShutdown() + pymtml.mtmlShutdown() except Exception: pass @@ -91,12 +103,7 @@ def _collect_via_pymtml() -> list[dict]: def _collect_via_smi() -> list[dict]: - """Fallback parser for ``mthreads-gmi`` text output. - - The output format mirrors nvidia-smi: a header with the driver / MUSA - versions followed by per-device blocks listing the product name and - memory usage. We only need the device name and total memory. - """ + """Parse ``mthreads-gmi`` text output (mthreads-gmi 1.14+ tabular format).""" try: out = subprocess.check_output( ["mthreads-gmi"], text=True, stderr=subprocess.DEVNULL @@ -110,21 +117,18 @@ def _collect_via_smi() -> list[dict]: driver = m.group(1) accelerators: list[dict] = [] - # Per-device rows look like: - # | 0 MTT S4000 ... | 0000:65:00.0 Off | ... | - # followed by: - # | 0% 45C P0 ... / ... | 234MiB / 49152MiB | ... 
| + # Example row: + # 0 MTT S4000 |00000000:28:00.0 |0% 4MiB(49152MiB) for match in re.finditer( - r"\|\s*(\d+)\s+(MTT\s+\S+(?:\s+\S+)?)\s*", out + r"^(\d+)\s+(MTT\s+\S+)\s+\|", + out, + re.MULTILINE, ): idx = int(match.group(1)) name = match.group(2).strip() - # Search downstream of this match for the memory line - tail = out[match.end():] - mem_match = re.search(r"(\d+)MiB\s*/\s*(\d+)MiB", tail) - memory_gb = None - if mem_match: - memory_gb = round(int(mem_match.group(2)) / 1024, 1) + tail = out[match.end(): match.end() + 256] + mem_match = re.search(r"\d+MiB\((\d+)MiB\)", tail) + memory_gb = round(int(mem_match.group(1)) / 1024, 1) if mem_match else None accelerators.append( { "index": idx, @@ -139,23 +143,69 @@ def _collect_via_smi() -> list[dict]: return accelerators +def _collect_via_torch() -> list[dict]: + """Fallback when management libraries are missing but torch MUSA is loaded.""" + try: + import torch + except ImportError: + return [] + + driver = _driver_version_from_smi() or "unknown" + accelerators: list[dict] = [] + + if hasattr(torch, "musa"): + try: + count = torch.musa.device_count() + get_props = torch.musa.get_device_properties + except Exception: + count = 0 + get_props = None + else: + try: + count = torch.cuda.device_count() + get_props = torch.cuda.get_device_properties + except Exception: + return [] + + for idx in range(int(count)): + try: + props = get_props(idx) + name = getattr(props, "name", None) or f"MTT GPU {idx}" + total = getattr(props, "total_memory", None) + memory_gb = round(total / (1024 ** 3), 1) if total else None + except Exception: + continue + accelerators.append( + { + "index": idx, + "name": name if isinstance(name, str) else str(name), + "vendor": VENDOR_LABEL, + "memory_gb": memory_gb, + "driver_version": driver, + "firmware_version": None, + "supports_bf16": _supports_bf16(str(name)), + } + ) + return accelerators + + def collect() -> list[dict]: - accelerators = _collect_via_pymtml() - if accelerators: - return 
accelerators - return _collect_via_smi() + for fn in (_collect_via_pymtml, _collect_via_smi, _collect_via_torch): + accelerators = fn() + if accelerators: + return accelerators + return [] def detect_runtime_version() -> str | None: - """Prefer torch.version.musa (most reliable when torchada is installed), - fall back to scraping ``mthreads-gmi`` header. - """ try: import torch ver = getattr(torch.version, "musa", None) if ver: return f"MUSA {ver}" + if getattr(torch.version, "cuda", None): + return f"MUSA (torch.cuda shim) {torch.version.cuda}" except ImportError: pass @@ -174,17 +224,43 @@ def detect_runtime_version() -> str | None: return None +def detect_pcie_gen() -> str | None: + try: + out = subprocess.check_output( + ["mthreads-gmi"], text=True, stderr=subprocess.DEVNULL + ) + m = re.search(r"\|\s*(\d+)x\((\d+)x\)\s*\|", out) + if m: + return f"PCIe {m.group(1)}x/{m.group(2)}x" + except Exception: + pass + return None + + +def detect_intra_node_interconnect() -> str | None: + """Moore Threads multi-GPU hosts typically use MCCL over PCIe.""" + accels = collect() + if len(accels) > 1: + return "MCCL/PCIe" + return None + + def diagnostics(env: dict, accelerators: list[dict]) -> list[str]: notes: list[str] = [] - if accelerators and (env.get("pytorch_version") or "") == "unknown": + if not accelerators: + notes.append( + "No Moore Threads MUSA GPUs detected (tried pymtml, mthreads-gmi, " + "and torch). Install the MUSA driver/toolkit per " + "https://github.com/MooreThreads/vllm-musa ." + ) + return notes + if (env.get("pytorch_version") or "") == "unknown": notes.append( - "PyTorch (with the torchada MUSA shim) is not installed — " - "pytorch_version is unknown." + "PyTorch with MUSA support is not installed — pytorch_version is unknown." ) - if accelerators and (env.get("runtime_version") or "") == "unknown": + if (env.get("runtime_version") or "") == "unknown": notes.append( "Could not detect MUSA runtime (tried torch.version.musa and " - "mthreads-gmi). 
runtime_version is unknown — install torchada " - "or the Moore Threads MUSA toolkit." + "mthreads-gmi). runtime_version is unknown." ) return notes diff --git a/schema/env.schema.json b/schema/env.schema.json index 60fc5e8..e80cd94 100644 --- a/schema/env.schema.json +++ b/schema/env.schema.json @@ -16,7 +16,7 @@ "properties": { "index": { "type": "integer" }, "name": { "type": "string" }, - "vendor": { "type": "string", "description": "Chip vendor, e.g. 'NVIDIA', 'AMD', 'Huawei', 'Apple'" }, + "vendor": { "type": "string", "description": "Chip vendor, e.g. 'NVIDIA', 'AMD', 'Huawei', 'Apple', 'Moore Threads'" }, "memory_gb": { "type": ["number","null"], "minimum": 0 }, "driver_version": { "type": "string" }, "firmware_version": { "type": ["string","null"] }, diff --git a/schema/result.schema.json b/schema/result.schema.json index 99a0517..fb81a8a 100644 --- a/schema/result.schema.json +++ b/schema/result.schema.json @@ -36,7 +36,7 @@ "vendor": { "type": "string", "enum": ["NVIDIA","AMD","Intel","Google","Huawei","Cambricon","Biren", - "Enflame","MetaX","Iluvatar","Apple","Qualcomm","Other"] + "Enflame","MetaX","Moore Threads","Iluvatar","Apple","Qualcomm","Other"] }, "count": { "type": "integer", "minimum": 1 }, "memory_gb": { "type": "number", "minimum": 0 },