SemiAnalysisAI · arygupt · May 29, 2026 · cursor · May 29, 2026 · claude
diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh
@@ -67,6 +67,31 @@
             echo "=== Slurm job stderr ==="
             tail -100 "$err_file"
             echo "========================"
+            # Surface the real failure class in the Actions UI. Without this, a
+            # launch failure shows only the generic "No benchmark result files
+            # found" from benchmark-multinode-tmpl.yml. First match wins, so the
+            # deterministic errors (model-not-found #1581, fp8 combine #1584) are
+            # tested before any flake pattern, and the broad MoRI catch-all is
+            # last: a config bug or segfault is never reported as a generic flake.
+            if [[ -n "${GITHUB_ACTIONS:-}" ]]; then
+                local sig=""
+                if   grep -qiE "Model '.*' not found|FATAL: Model|model .* not found" "$err_file"; then
+                    sig="recipe-error: model not found (deterministic - check MODEL/MODEL_PATH, not MoRI)"
+                elif grep -qiE "Fp8BlockwiseQuant.*IntraNode|dispatch_combine|combine.*IntraNode" "$err_file"; then
+                    sig="config-error: MoRI fp8_blockwise combine needs IntraNode (disable TBO/SDMA on FP4 prefill, #1584)"
+                elif grep -qiE "ReadTimeout|readiness.*timeout|warmup.*time(d)? ?out|health.*timeout" "$err_file"; then
+                    sig="transport-flake: readiness/warmup timeout (MoRI pd-disagg)"
+                elif grep -qiE "segfault|Segmentation fault|signal 11|core dumped|gpucore" "$err_file"; then
+                    sig="transport-flake: server segfault / core dump"
+                elif grep -qiE "MoRI|mori_conn|pd[- ]?disagg" "$err_file"; then
+                    sig="transport-flake: MoRI KV-transport error"
+                fi
+                if [[ -n "$sig" ]]; then
+                    echo "::error title=AMD disagg job ${JOB_ID:-unknown} failed::${sig} (see slurm .err artifact)"
+                else
+                    echo "::error title=AMD disagg job ${JOB_ID:-unknown} failed::Unclassified failure - see last 100 lines of slurm .err above"
+                fi
+            fi
         fi
         sudo rm -rf "$BENCHMARK_LOGS_DIR" 2>/dev/null || true
     }

diff --git a/utils/bench_serving/benchmark_serving.py b/utils/bench_serving/benchmark_serving.py
@@ -879,6 +879,21 @@ def main(args: argparse.Namespace):
             lora_modules=args.lora_modules,
         ))
 
+    # Gate the run BEFORE writing any result file. A sub-threshold (or
+    # zero-completion) run must not leave a schema-valid JSON on disk:
+    # downstream collectors (launch_mi355x-amds.sh, benchmark-multinode-tmpl.yml)
+    # treat file *existence* as success, so a written-then-failed file looks
+    # successful. Raising here keeps disk state consistent with the exit code.
+    max_failure_rate = 0.05
+    completed = benchmark_result["completed"]
+    failure_rate = 1 - completed / args.num_prompts
+    if failure_rate > max_failure_rate:
+        raise SystemExit(
+            f"FAIL: request failure rate {failure_rate:.1%} exceeds "
+            f"{max_failure_rate:.0%} threshold "
+            f"({completed}/{args.num_prompts} completed)"
+        )
+
     # Save config and results to json
     if args.save_result:
         result_json: Dict[str, Any] = {}
@@ -940,16 +955,6 @@ def main(args: argparse.Namespace):
             json.dump(result_json, outfile)
         save_to_pytorch_benchmark_format(args, result_json, file_name)
 
-    max_failure_rate = 0.05
-    completed = benchmark_result["completed"]
-    failure_rate = 1 - completed / args.num_prompts
-    if failure_rate > max_failure_rate:
-        raise SystemExit(
-            f"FAIL: request failure rate {failure_rate:.1%} exceeds "
-            f"{max_failure_rate:.0%} threshold "
-            f"({completed}/{args.num_prompts} completed)"
-        )
-
 
 if __name__ == "__main__":
     parser = FlexibleArgumentParser(