
Commit a08ee95

[Bench] Add stats for benchmark executions in GHA summary
1 parent 431a278 commit a08ee95

4 files changed: +132, -20 lines changed

devops/actions/run-tests/benchmark/action.yml

Lines changed: 4 additions & 0 deletions
@@ -231,6 +231,9 @@ runs:
 WORKDIR="$(realpath ./llvm_test_workdir)"
 if [ -n "$WORKDIR" ] && [ -d "$WORKDIR" ] && [[ "$WORKDIR" == *llvm_test_workdir* ]]; then rm -rf "$WORKDIR" ; fi

+# Clean up potentially existing, old summary file
+[ -f "github_summary.md" ] && rm github_summary.md
+
 numactl --cpunodebind "$NUMA_NODE" --membind "$NUMA_NODE" \
 ./devops/scripts/benchmarks/main.py "$WORKDIR" \
 --sycl "$(realpath ./toolchain)" \
@@ -243,6 +246,7 @@ runs:
 --preset "$PRESET" \
 --timestamp-override "$SAVE_TIMESTAMP" \
 --detect-version sycl,compute_runtime \
+--produce-github-summary \
 ${{ inputs.exit_on_failure == 'true' && '--exit-on-failure --iterations 1' || '' }}
 # TODO: add back: "--flamegraph inclusive" once works properly
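
Why the file is removed up front: in the Python changes below, main.py truncates the summary file when it writes it ("w"), while compare.py only appends ("a"). A minimal sketch of the intended per-run sequence, assuming compare.py runs after main.py later in the workflow (that ordering is not shown in this diff):

```python
# Sketch only: the filename and open() modes are taken from this commit; the ordering
# of the two scripts within the workflow is an assumption.

# Action step above: drop a summary left over from a previous run.
#   [ -f "github_summary.md" ] && rm github_summary.md

# main.py (generate_github_summary, added below): creates the file fresh.
with open("github_summary.md", "w") as f:
    f.write("### Benchmarks Execution\n...")  # execution statistics section

# compare.py: appends its regression section, keeping the stats section above it.
with open("github_summary.md", "a") as f:
    f.write("\n### Regressions\n...")
```

If a run aborts before main.py gets to its write, the upfront rm keeps compare.py from appending onto last run's stale summary.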

devops/scripts/benchmarks/compare.py

Lines changed: 6 additions & 3 deletions
@@ -357,7 +357,8 @@ def to_hist(
     parser_avg.add_argument(
         "--produce-github-summary",
         action="store_true",
-        help=f"Create a summary file '{options.github_summary_filename}' for Github workflow summaries.",
+        help=f"Produce regression summary for Github workflow, in file '{options.github_summary_filename}'.",
+        default=False,
     )

     args = parser.parse_args()
@@ -473,14 +474,16 @@ def print_regression(entry: dict, is_warning: bool = False):

         if not args.dry_run:
             if args.produce_github_summary:
-                with open(options.github_summary_filename, "w") as f:
+                with open(options.github_summary_filename, "a") as f:
                     f.write("\n".join(gh_summary))
             exit(1) # Exit 1 to trigger Github test failure

     log.info("No unexpected regressions found!")
     if args.produce_github_summary:
+        gh_summary.append("")
+        gh_summary.append("### Regressions")
         gh_summary.append("No unexpected regressions found!")
-        with open(options.github_summary_filename, "w") as f:
+        with open(options.github_summary_filename, "a") as f:
             f.write("\n".join(gh_summary))

 else:
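
For orientation, the no-regression path now appends the following to the shared summary file. The lines are taken from the hunk above; "github_summary.md" is the default of options.github_summary_filename.

```python
# What compare.py appends when no regressions are found (lines from the hunk above);
# gh_summary is built earlier in compare.py, shown empty here for the sketch.
gh_summary = []
gh_summary.append("")
gh_summary.append("### Regressions")
gh_summary.append("No unexpected regressions found!")
with open("github_summary.md", "a") as f:  # "a" keeps main.py's stats section above
    f.write("\n".join(gh_summary))
```

In the regression path, the same file is appended to (again with mode "a") before exit(1) fails the job.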

devops/scripts/benchmarks/main.py

Lines changed: 120 additions & 16 deletions
@@ -32,6 +32,65 @@
 from presets import enabled_suites, presets


+def generate_github_summary(execution_stats, failures):
+    """Generate GitHub workflow summary with execution statistics"""
+    gh_summary: list[str] = []
+    gh_summary.append("### Benchmarks Execution")
+
+    # Overall statistics
+    total_tests = execution_stats["total_tests"]
+    passed_tests = execution_stats["tests_passed"]
+    failed_tests = execution_stats["tests_failed"]
+    warnings = execution_stats["warnings"]
+    errors = len(failures)
+
+    gh_summary.append("#### Overall Statistics")
+    gh_summary.append(f"- **Total Number of benchmarks:** {total_tests}")
+    gh_summary.append(f"- **Tests Passed:** {passed_tests}")
+    gh_summary.append(f"- **Tests Failed:** {failed_tests}")
+    gh_summary.append(f"- **Errors:** {errors}")
+    gh_summary.append(f"- **Warnings:** {warnings}")
+    gh_summary.append("")
+
+    # Overall status of execution
+    if failed_tests == 0 and errors == 0:
+        gh_summary.append("#### ✅ Status: SUCCESS")
+        gh_summary.append("Benchmarks seem to have executed successfully!")
+    elif failed_tests > 0 or errors > 0:
+        gh_summary.append("#### ❌ Status: FAILURES DETECTED")
+        gh_summary.append("Some benchmarks failed or encountered errors.")
+
+    if warnings > 0:
+        gh_summary.append("#### ⚠️ Status: WARNINGS DETECTED")
+        gh_summary.append("Some benchmarks executed with warnings.")
+
+    gh_summary.append("")
+
+    # Detailed failures info
+    if failures:
+        gh_summary.append("#### Failure Details")
+        gh_summary.append(
+            f"<details><summary>{len(failures)} failed benchmarks:</summary>"
+        )
+        gh_summary.append("")
+
+        for benchmark_name, failure_reason in failures.items():
+            gh_summary.append(f"##### {benchmark_name}")
+            gh_summary.append(f"- **Reason:** {failure_reason}")
+            gh_summary.append("")
+
+        gh_summary.append("</details>")
+        gh_summary.append("")
+
+    # Write the summary to file
+    try:
+        with open(options.github_summary_filename, "w") as f:
+            f.write("\n".join(gh_summary))
+        log.info(f"GitHub summary written to {options.github_summary_filename}")
+    except Exception as e:
+        log.error(f"Failed to write GitHub summary: {e}")
+
+
 def run_iterations(
     benchmark: Benchmark,
     env_vars,
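
To make the new helper concrete, here is a hypothetical invocation together with the rough shape of the Markdown it writes. The benchmark name and counts below are invented; the headings and field labels come from the function above.

```python
# Hypothetical example: values and the benchmark name are made up, the structure is
# taken from generate_github_summary() as added in this commit.
execution_stats = {"total_tests": 3, "tests_passed": 2, "tests_failed": 1, "warnings": 0}
failures = {"example_benchmark": "Benchmark 'example_benchmark' run failure: timed out"}

generate_github_summary(execution_stats, failures)
# Writes options.github_summary_filename ("github_summary.md" by default), roughly:
#   ### Benchmarks Execution
#   #### Overall Statistics
#   - **Total Number of benchmarks:** 3
#   - **Tests Passed:** 2
#   - **Tests Failed:** 1
#   - **Errors:** 1
#   - **Warnings:** 0
#
#   #### ❌ Status: FAILURES DETECTED
#   Some benchmarks failed or encountered errors.
#
#   #### Failure Details
#   <details><summary>1 failed benchmarks:</summary>
#   ##### example_benchmark
#   - **Reason:** Benchmark 'example_benchmark' run failure: timed out
#   </details>
```
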
@@ -40,7 +99,12 @@ def run_iterations(
     failures: dict[str, str],
     run_trace: TracingType = TracingType.NONE,
     force_trace: bool = False,
-):
+) -> bool:
+    """
+    Returns True if all iterations completed successfully, False otherwise.
+    Unless options.exit_on_failure is set, then exception is raised.
+    """
+
     for iter in range(iters):
         log.info(f"running {benchmark.name()}, iteration {iter}... ")
         try:
@@ -49,10 +113,10 @@ def run_iterations(
             )
             if bench_results is None:
                 if options.exit_on_failure:
-                    raise RuntimeError(f"Benchmark produced no results!")
+                    raise RuntimeError("Benchmark produced no results!")
                 else:
                     failures[benchmark.name()] = "benchmark produced no results!"
-                    break
+                    return False

             for bench_result in bench_results:
                 log.info(
@@ -73,10 +137,15 @@ def run_iterations(
                            f"Benchmark failed: {failure_label} verification failed: {str(e)}"
                        )
                    else:
-                        failures[failure_label] = f"verification failed: {str(e)}"
-                        log.error(f"complete ({failure_label}: verification failed: {str(e)}).")
+                        failures[failure_label] = (
+                            f"{failure_label}: verification failed: {str(e)}"
+                        )
+                        log.error(f"{failure_label}: verification failed: {str(e)}.")
                    continue

+    # Iterations completed successfully
+    return True
+

 # https://www.statology.org/modified-z-score/
 def modified_z_score(values: list[float]) -> list[float]:
@@ -110,7 +179,7 @@ def remove_outliers(


 def process_results(
-    results: dict[str, list[Result]], stddev_threshold_override
+    results: dict[str, list[Result]], stddev_threshold_override, execution_stats
 ) -> tuple[bool, list[Result]]:
     processed: list[Result] = []
     # technically, we can detect whether result is below or above threshold per
@@ -142,6 +211,7 @@ def process_results(
             log.warning(
                 f"stddev {stddev} above the threshold {threshold_scaled} ({threshold} times {mean_value}) for {label}"
             )
+            execution_stats["warnings"] += 1
             valid_results = False

         rlist.sort(key=lambda res: res.value)
@@ -170,7 +240,7 @@ def collect_metadata(suites):
     return metadata


-def main(directory, additional_env_vars, compare_names, filter):
+def main(directory, additional_env_vars, compare_names, filter, execution_stats):
     prepare_workdir(directory, INTERNAL_WORKDIR_VERSION)

     if options.dry_run:
@@ -218,7 +288,7 @@ def main(directory, additional_env_vars, compare_names, filter):

     # TODO: rename "s", rename setup in suite to suite_setup, rename setup in benchmark to benchmark_setup
     # TODO: do not add benchmarks whose suite setup failed
-    # TODO: add a mode where we fail etire script in case of setup (or other) failures and use in CI
+    # TODO: add a mode where we fail entire script in case of setup (or other) failures and use in CI

     for s in suites:
         if s.name() not in enabled_suites(options.preset):
@@ -246,9 +316,9 @@ def main(directory, additional_env_vars, compare_names, filter):
         except Exception as e:
             if options.exit_on_failure:
                 raise e
-            failures[s.name()] = f"Suite setup failure: {e}"
+            failures[s.name()] = f"Suite '{s.name()}' setup failure: {e}"
             log.error(
-                f"{type(s).__name__} setup failed. Benchmarks won't be added."
+                f"Suite {type(s).__name__} setup failed. Benchmarks won't be added."
             )
             log.error(f"failed: {e}")
         else:
@@ -265,19 +335,23 @@ def main(directory, additional_env_vars, compare_names, filter):
             if options.exit_on_failure:
                 raise e
             else:
-                failures[benchmark.name()] = f"Benchmark setup failure: {e}"
+                failures[benchmark.name()] = (
+                    f"Benchmark '{benchmark.name()}' setup failure: {e}"
+                )
                 log.error(f"failed: {e}")

     results = []
     if benchmarks:
         log.info(f"Running {len(benchmarks)} benchmarks...")
+        execution_stats["total_tests"] = len(benchmarks)
     elif not options.dry_run:
         raise RuntimeError("No benchmarks to run.")
     for benchmark in benchmarks:
         try:
             merged_env_vars = {**additional_env_vars}
             intermediate_results: dict[str, list[Result]] = {}
             processed: list[Result] = []
+            iterations_rc = False

             # Determine if we should run regular benchmarks
             # Run regular benchmarks if:
@@ -292,7 +366,7 @@ def main(directory, additional_env_vars, compare_names, filter):

             if should_run_regular:
                 for _ in range(options.iterations_stddev):
-                    run_iterations(
+                    iterations_rc = run_iterations(
                         benchmark,
                         merged_env_vars,
                         options.iterations,
@@ -301,7 +375,9 @@ def main(directory, additional_env_vars, compare_names, filter):
                         run_trace=TracingType.NONE,
                     )
                     valid, processed = process_results(
-                        intermediate_results, benchmark.stddev_threshold()
+                        intermediate_results,
+                        benchmark.stddev_threshold(),
+                        execution_stats,
                     )
                     if valid:
                         break
@@ -310,7 +386,7 @@ def main(directory, additional_env_vars, compare_names, filter):
             if options.unitrace and (
                 benchmark.traceable(TracingType.UNITRACE) or args.unitrace == "force"
             ):
-                run_iterations(
+                iterations_rc = run_iterations(
                     benchmark,
                     merged_env_vars,
                     1,
@@ -324,7 +400,7 @@ def main(directory, additional_env_vars, compare_names, filter):
                 benchmark.traceable(TracingType.FLAMEGRAPH)
                 or args.flamegraph == "force"
             ):
-                run_iterations(
+                iterations_rc = run_iterations(
                     benchmark,
                     merged_env_vars,
                     1,
@@ -335,11 +411,18 @@ def main(directory, additional_env_vars, compare_names, filter):
                 )

             results += processed
+            if iterations_rc:
+                execution_stats["tests_passed"] += 1
+            else:
+                execution_stats["tests_failed"] += 1
         except Exception as e:
+            execution_stats["tests_failed"] += 1
             if options.exit_on_failure:
                 raise e
             else:
-                failures[benchmark.name()] = f"Benchmark run failure: {e}"
+                failures[benchmark.name()] = (
+                    f"Benchmark '{benchmark.name()}' run failure: {e}"
+                )
                 log.error(f"failed: {e}")

     this_name = options.current_run_name
@@ -408,6 +491,10 @@ def main(directory, additional_env_vars, compare_names, filter):
         generate_html(history, compare_names, html_path, metadata)
         log.info(f"HTML with benchmark results has been generated")

+    # Generate GitHub summary
+    if options.produce_github_summary:
+        generate_github_summary(execution_stats, failures)
+
     if options.exit_on_failure and failures:
         # just in case code missed to raise earlier
         raise RuntimeError(str(failures))
@@ -691,6 +778,12 @@ def validate_and_parse_env_args(env_args):
         help="Set the logging level",
         default="info",
     )
+    parser.add_argument(
+        "--produce-github-summary",
+        action="store_true",
+        help=f"Produce execution stats summary for Github workflow, in file '{options.github_summary_filename}'.",
+        default=False,
+    )

     args = parser.parse_args()
     additional_env_vars = validate_and_parse_env_args(args.env)
@@ -724,6 +817,7 @@ def validate_and_parse_env_args(env_args):
     options.flamegraph = args.flamegraph is not None
     options.archive_baseline_days = args.archive_baseline_after
     options.archive_pr_days = args.archive_pr_after
+    options.produce_github_summary = args.produce_github_summary

     # Initialize logger with command line arguments
     initialize_logger(args.verbose, args.log_level)
@@ -738,6 +832,14 @@ def validate_and_parse_env_args(env_args):
            parser.error("Specified --output-dir is not a valid path")
        options.output_directory = os.path.abspath(args.output_dir)

+    # Initialize GitHub summary tracking
+    execution_stats = {
+        "total_tests": 0,
+        "tests_passed": 0,
+        "tests_failed": 0,
+        "warnings": 0,
+    }
+
     # Options intended for CI:
     options.timestamp_override = args.timestamp_override
     if args.results_dir is not None:
@@ -780,6 +882,7 @@ def validate_and_parse_env_args(env_args):
         options.device_architecture = ""
         log.warning(f"Failed to fetch device architecture: {e}")
         log.warning("Defaulting to generic benchmark parameters.")
+        execution_stats["warnings"] += 1

     log.info(f"Selected device architecture: {options.device_architecture}")

@@ -788,4 +891,5 @@ def validate_and_parse_env_args(env_args):
         additional_env_vars,
         args.compare,
         benchmark_filter,
+        execution_stats,
     )
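
Pulling the main.py hunks together, these are the places that feed generate_github_summary(); the block below only summarizes this commit and adds no new behavior.

```python
# Where main.py touches execution_stats, per the hunks above (orientation only).
execution_stats = {
    "total_tests": 0,   # set to len(benchmarks) once the benchmark list is built
    "tests_passed": 0,  # +1 when a benchmark's last run_iterations() call returned True
    "tests_failed": 0,  # +1 when it returned False or the benchmark raised
    "warnings": 0,      # +1 per stddev-over-threshold in process_results(),
                        # +1 if fetching the device architecture fails
}
# The dict is created alongside argument parsing, passed into main(), and rendered by
# generate_github_summary() only when --produce-github-summary is set.
```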

devops/scripts/benchmarks/options.py

Lines changed: 2 additions & 1 deletion
@@ -97,7 +97,8 @@ class Options:
     # CI scripts vs SYCl build source.
     github_repo_override: str = None
     git_commit_override: str = None
-    # Filename used to store Github summary files:
+    # Flag and filename used to store Github summary files:
+    produce_github_summary: bool = False
     github_summary_filename: str = "github_summary.md"
     # Archiving settings
     # Archived runs are stored separately from the main dataset but are still accessible
