 from presets import enabled_suites, presets


+def generate_github_summary(execution_stats, failures):
+    """Generate GitHub workflow summary with execution statistics"""
+    gh_summary: list[str] = []
+    gh_summary.append("### Benchmarks Execution")
+
+    # Overall statistics
+    total_tests = execution_stats["total_tests"]
+    passed_tests = execution_stats["tests_passed"]
+    failed_tests = execution_stats["tests_failed"]
+    warnings = execution_stats["warnings"]
+    errors = len(failures)
+
+    gh_summary.append("#### Overall Statistics")
+    gh_summary.append(f"- **Total Number of benchmarks:** {total_tests}")
+    gh_summary.append(f"- **Tests Passed:** {passed_tests}")
+    gh_summary.append(f"- **Tests Failed:** {failed_tests}")
+    gh_summary.append(f"- **Errors:** {errors}")
+    gh_summary.append(f"- **Warnings:** {warnings}")
+    gh_summary.append("")
+
+    # Overall status of execution
+    if failed_tests == 0 and errors == 0:
+        gh_summary.append("#### ✅ Status: SUCCESS")
+        gh_summary.append("Benchmarks seem to have executed successfully!")
+    elif failed_tests > 0 or errors > 0:
+        gh_summary.append("#### ❌ Status: FAILURES DETECTED")
+        gh_summary.append("Some benchmarks failed or encountered errors.")
+
+    if warnings > 0:
+        gh_summary.append("#### ⚠️ Status: WARNINGS DETECTED")
+        gh_summary.append("Some benchmarks executed with warnings.")
+
+    gh_summary.append("")
+
+    # Detailed failures info
+    if failures:
+        gh_summary.append("#### Failure Details")
+        gh_summary.append(
+            f"<details><summary>{len(failures)} failed benchmarks:</summary>"
+        )
+        gh_summary.append("")
+
+        for benchmark_name, failure_reason in failures.items():
+            gh_summary.append(f"##### {benchmark_name}")
+            gh_summary.append(f"- **Reason:** {failure_reason}")
+            gh_summary.append("")
+
+        gh_summary.append("</details>")
+        gh_summary.append("")
+
+    # Write the summary to file
+    try:
+        with open(options.github_summary_filename, "w") as f:
+            f.write("\n".join(gh_summary))
+        log.info(f"GitHub summary written to {options.github_summary_filename}")
+    except Exception as e:
+        log.error(f"Failed to write GitHub summary: {e}")
+
+
 def run_iterations(
     benchmark: Benchmark,
     env_vars,
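# A hedged sketch (not taken from the patch itself): the shapes that
# generate_github_summary() above expects. The counters are filled in by
# main() and run_iterations() further down in this diff; the numbers and the
# benchmark name here are invented for illustration only.
example_execution_stats = {
    "total_tests": 3,
    "tests_passed": 2,
    "tests_failed": 1,
    "warnings": 0,
}
example_failures = {
    "example_benchmark": "benchmark produced no results!",  # hypothetical entry
}
# generate_github_summary(example_execution_stats, example_failures) would then
# write a Markdown report (overall statistics, a status heading, and a
# <details> list of failures) to options.github_summary_filename.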
@@ -40,7 +99,12 @@ def run_iterations(
     failures: dict[str, str],
     run_trace: TracingType = TracingType.NONE,
     force_trace: bool = False,
-):
+) -> bool:
+    """
+    Returns True if all iterations completed successfully, False otherwise.
+    If options.exit_on_failure is set, an exception is raised instead.
+    """
+
     for iter in range(iters):
         log.info(f"running {benchmark.name()}, iteration {iter}... ")
         try:
@@ -49,10 +113,10 @@ def run_iterations(
             )
             if bench_results is None:
                 if options.exit_on_failure:
-                    raise RuntimeError(f"Benchmark produced no results!")
+                    raise RuntimeError("Benchmark produced no results!")
                 else:
                     failures[benchmark.name()] = "benchmark produced no results!"
-                    break
+                    return False

             for bench_result in bench_results:
                 log.info(
@@ -73,10 +137,15 @@ def run_iterations(
73137 f"Benchmark failed: { failure_label } verification failed: { str (e )} "
74138 )
75139 else :
76- failures [failure_label ] = f"verification failed: { str (e )} "
77- log .error (f"complete ({ failure_label } : verification failed: { str (e )} )." )
140+ failures [failure_label ] = (
141+ f"{ failure_label } : verification failed: { str (e )} "
142+ )
143+ log .error (f"{ failure_label } : verification failed: { str (e )} ." )
78144 continue
79145
146+ # Iterations completed successfully
147+ return True
148+
80149
81150# https://www.statology.org/modified-z-score/
82151def modified_z_score (values : list [float ]) -> list [float ]:
@@ -110,7 +179,7 @@ def remove_outliers(


 def process_results(
-    results: dict[str, list[Result]], stddev_threshold_override
+    results: dict[str, list[Result]], stddev_threshold_override, execution_stats
 ) -> tuple[bool, list[Result]]:
     processed: list[Result] = []
     # technically, we can detect whether result is below or above threshold per
@@ -142,6 +211,7 @@ def process_results(
             log.warning(
                 f"stddev {stddev} above the threshold {threshold_scaled} ({threshold} times {mean_value}) for {label}"
             )
+            execution_stats["warnings"] += 1
             valid_results = False

         rlist.sort(key=lambda res: res.value)
@@ -170,7 +240,7 @@ def collect_metadata(suites):
     return metadata


-def main(directory, additional_env_vars, compare_names, filter):
+def main(directory, additional_env_vars, compare_names, filter, execution_stats):
     prepare_workdir(directory, INTERNAL_WORKDIR_VERSION)

     if options.dry_run:
@@ -218,7 +288,7 @@ def main(directory, additional_env_vars, compare_names, filter):

     # TODO: rename "s", rename setup in suite to suite_setup, rename setup in benchmark to benchmark_setup
     # TODO: do not add benchmarks whose suite setup failed
-    # TODO: add a mode where we fail etire script in case of setup (or other) failures and use in CI
+    # TODO: add a mode where we fail entire script in case of setup (or other) failures and use in CI

     for s in suites:
         if s.name() not in enabled_suites(options.preset):
@@ -246,9 +316,9 @@ def main(directory, additional_env_vars, compare_names, filter):
         except Exception as e:
             if options.exit_on_failure:
                 raise e
-            failures[s.name()] = f"Suite setup failure: {e}"
+            failures[s.name()] = f"Suite '{s.name()}' setup failure: {e}"
             log.error(
-                f"{type(s).__name__} setup failed. Benchmarks won't be added."
+                f"Suite {type(s).__name__} setup failed. Benchmarks won't be added."
             )
             log.error(f"failed: {e}")
         else:
@@ -265,19 +335,23 @@ def main(directory, additional_env_vars, compare_names, filter):
             if options.exit_on_failure:
                 raise e
             else:
-                failures[benchmark.name()] = f"Benchmark setup failure: {e}"
+                failures[benchmark.name()] = (
+                    f"Benchmark '{benchmark.name()}' setup failure: {e}"
+                )
                 log.error(f"failed: {e}")

     results = []
     if benchmarks:
         log.info(f"Running {len(benchmarks)} benchmarks...")
+        execution_stats["total_tests"] = len(benchmarks)
     elif not options.dry_run:
         raise RuntimeError("No benchmarks to run.")
     for benchmark in benchmarks:
         try:
             merged_env_vars = {**additional_env_vars}
             intermediate_results: dict[str, list[Result]] = {}
             processed: list[Result] = []
+            iterations_rc = False

             # Determine if we should run regular benchmarks
             # Run regular benchmarks if:
@@ -292,7 +366,7 @@ def main(directory, additional_env_vars, compare_names, filter):

             if should_run_regular:
                 for _ in range(options.iterations_stddev):
-                    run_iterations(
+                    iterations_rc = run_iterations(
                         benchmark,
                         merged_env_vars,
                         options.iterations,
@@ -301,7 +375,9 @@ def main(directory, additional_env_vars, compare_names, filter):
                         run_trace=TracingType.NONE,
                     )
                     valid, processed = process_results(
-                        intermediate_results, benchmark.stddev_threshold()
+                        intermediate_results,
+                        benchmark.stddev_threshold(),
+                        execution_stats,
                     )
                     if valid:
                         break
@@ -310,7 +386,7 @@ def main(directory, additional_env_vars, compare_names, filter):
             if options.unitrace and (
                 benchmark.traceable(TracingType.UNITRACE) or args.unitrace == "force"
             ):
-                run_iterations(
+                iterations_rc = run_iterations(
                     benchmark,
                     merged_env_vars,
                     1,
@@ -324,7 +400,7 @@ def main(directory, additional_env_vars, compare_names, filter):
                 benchmark.traceable(TracingType.FLAMEGRAPH)
                 or args.flamegraph == "force"
             ):
-                run_iterations(
+                iterations_rc = run_iterations(
                     benchmark,
                     merged_env_vars,
                     1,
@@ -335,11 +411,18 @@ def main(directory, additional_env_vars, compare_names, filter):
                 )

             results += processed
+            if iterations_rc:
+                execution_stats["tests_passed"] += 1
+            else:
+                execution_stats["tests_failed"] += 1
         except Exception as e:
+            execution_stats["tests_failed"] += 1
             if options.exit_on_failure:
                 raise e
             else:
-                failures[benchmark.name()] = f"Benchmark run failure: {e}"
+                failures[benchmark.name()] = (
+                    f"Benchmark '{benchmark.name()}' run failure: {e}"
+                )
                 log.error(f"failed: {e}")

     this_name = options.current_run_name
@@ -408,6 +491,10 @@ def main(directory, additional_env_vars, compare_names, filter):
         generate_html(history, compare_names, html_path, metadata)
         log.info(f"HTML with benchmark results has been generated")

+    # Generate GitHub summary
+    if options.produce_github_summary:
+        generate_github_summary(execution_stats, failures)
+
     if options.exit_on_failure and failures:
         # just in case code missed to raise earlier
         raise RuntimeError(str(failures))
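# A minimal sketch (an assumption, not part of the patch) of how the file
# produced by generate_github_summary() could be surfaced in a GitHub Actions
# run. GITHUB_STEP_SUMMARY is the standard environment variable GitHub Actions
# provides for per-step Markdown summaries; the default filename and the
# helper name below are placeholders for options.github_summary_filename.
import os
import shutil

def publish_to_step_summary(summary_path: str = "github_summary.md") -> None:
    # Append the generated Markdown to the job's step summary when running in CI.
    step_summary = os.environ.get("GITHUB_STEP_SUMMARY")
    if not step_summary:
        return  # not running inside a GitHub Actions workflow
    with open(summary_path) as src, open(step_summary, "a") as dst:
        shutil.copyfileobj(src, dst)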
@@ -691,6 +778,12 @@ def validate_and_parse_env_args(env_args):
         help="Set the logging level",
         default="info",
     )
+    parser.add_argument(
+        "--produce-github-summary",
+        action="store_true",
+        help=f"Produce execution stats summary for GitHub workflow, in file '{options.github_summary_filename}'.",
+        default=False,
+    )

     args = parser.parse_args()
     additional_env_vars = validate_and_parse_env_args(args.env)
@@ -724,6 +817,7 @@ def validate_and_parse_env_args(env_args):
     options.flamegraph = args.flamegraph is not None
     options.archive_baseline_days = args.archive_baseline_after
     options.archive_pr_days = args.archive_pr_after
+    options.produce_github_summary = args.produce_github_summary

     # Initialize logger with command line arguments
     initialize_logger(args.verbose, args.log_level)
@@ -738,6 +832,14 @@ def validate_and_parse_env_args(env_args):
             parser.error("Specified --output-dir is not a valid path")
         options.output_directory = os.path.abspath(args.output_dir)

+    # Initialize GitHub summary tracking
+    execution_stats = {
+        "total_tests": 0,
+        "tests_passed": 0,
+        "tests_failed": 0,
+        "warnings": 0,
+    }
+
     # Options intended for CI:
     options.timestamp_override = args.timestamp_override
     if args.results_dir is not None:
@@ -780,6 +882,7 @@ def validate_and_parse_env_args(env_args):
         options.device_architecture = ""
         log.warning(f"Failed to fetch device architecture: {e}")
         log.warning("Defaulting to generic benchmark parameters.")
+        execution_stats["warnings"] += 1

     log.info(f"Selected device architecture: {options.device_architecture}")

@@ -788,4 +891,5 @@ def validate_and_parse_env_args(env_args):
         additional_env_vars,
         args.compare,
         benchmark_filter,
+        execution_stats,
     )