#!/usr/bin/env python3
# Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0

"""Compare metrics from two or more Firecracker performance test runs and
render the results as per-test PDF plots or HTML tables."""

import argparse
import glob
import json
import time
from pathlib import Path
from typing import Callable, List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages

pd.set_option("display.float_format", "{:.2f}".format)


def check_regression(
    a_samples: List[float],
    b_samples: List[float],
    statistic: Callable = np.mean,
    *,
    n_resamples=9999,
):
    """
    Check whether two sample groups differ significantly, using a permutation
    test on the difference of the given statistic.

    Returns a tuple of (p-value, relative difference, absolute difference),
    where the absolute difference is statistic(b) - statistic(a) and the
    relative difference is the absolute difference divided by statistic(a).
    """
    result = scipy.stats.permutation_test(
        (a_samples, b_samples),
        lambda x, y: statistic(y) - statistic(x),
        vectorized=False,
        n_resamples=n_resamples,
    )
    statistic_a = statistic(a_samples)

    return result.pvalue, result.statistic / statistic_a, result.statistic
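
# A minimal, hypothetical usage sketch for check_regression(); the sample
# values below are made up:
#
#     a = [10.1, 10.3, 9.9, 10.0]   # baseline samples
#     b = [11.0, 11.2, 10.9, 11.1]  # candidate samples
#     pvalue, diff_rel, diff_abs = check_regression(a, b, n_resamples=999)
#
# A small p-value together with a large relative difference suggests the
# groups genuinely differ (e.g. a performance regression in group B).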


def load_data(data_path: Path):
    """
    Recursively collect `metrics.json` files under the provided path and
    flatten them into a list of per-sample records.
    """
    data = []
    for name in glob.glob(f"{data_path}/**/metrics.json", recursive=True):
        with open(name, encoding="utf-8") as f:
            j = json.load(f)

        if "performance_test" not in j["dimensions"]:
            print(f"skipping: {name}")
            continue

        metrics = j["metrics"]
        perf_test = j["dimensions"]["performance_test"]
        del j["dimensions"]["performance_test"]
        dimensions = frozenset(j["dimensions"].items())

        for m in metrics:
            # Skip CPU utilization metrics
            if "cpu_utilization" in m:
                continue
            mm = metrics[m]
            unit = mm["unit"]
            values = mm["values"]
            for i, v in enumerate(values):
                data.append(
                    {
                        "index": i,
                        "test": perf_test,
                        "metric": m,
                        "value": v,
                        "unit": unit,
                        "dimensions": dimensions,
                    }
                )

    return data
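
# Illustrative shape of a metrics.json file consumed by load_data(); the
# test name, dimension keys, metric name, and values are hypothetical:
#
#     {
#       "dimensions": {"performance_test": "some_test", "host_kernel": "..."},
#       "metrics": {
#         "some_metric": {"unit": "ms", "values": [1.2, 1.3, 1.1]}
#       }
#     }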


def p50(a):
    """Returns the 50th percentile of 1-d array a"""
    return np.percentile(a, 50)


def p90(a):
    """Returns the 90th percentile of 1-d array a"""
    return np.percentile(a, 90)


def create_table(df: pd.DataFrame):
    """Create an HTML table per test in the data frame"""

    for test_value in df["test"].unique():
        df_test = df[df["test"] == test_value]

        # Split dimensions into separate columns
        df_expanded = df_test.copy()
        dim_data = []
        for _, row in df_expanded.iterrows():
            dim_dict = dict(row["dimensions"])
            dim_data.append(dim_dict)

        # Need to reset indexes because otherwise `pd.concat` will add NaN in
        # all rows where indexes differ
        dim_df = pd.DataFrame(dim_data).reset_index(drop=True)
        df_data = df_expanded.drop("dimensions", axis=1).reset_index(drop=True)
        df_expanded = pd.concat([df_data, dim_df], axis=1)

        # Use dimension columns as index
        dim_cols = sorted(dim_df.columns)
        df_pivoted = df_expanded.pivot_table(
            values=["value"],
            index=["metric", "unit"] + dim_cols,
            columns="group",
            aggfunc=[p50, p90],
        )

        # Add relative and absolute comparison columns for every ordered
        # pair of groups
        groups = sorted(df_test["group"].unique())
        for baseline in groups:
            for group in groups:
                if group == baseline:
                    continue
                for stat in ["p50", "p90"]:
                    diff_col = (stat, "value", f"{baseline}->{group} %")
                    df_pivoted[diff_col] = (
                        (
                            df_pivoted[(stat, "value", group)]
                            - df_pivoted[(stat, "value", baseline)]
                        )
                        / df_pivoted[(stat, "value", baseline)]
                        * 100.0
                    )
                    diff_col = (stat, "value", f"{baseline}->{group} abs")
                    df_pivoted[diff_col] = (
                        df_pivoted[(stat, "value", group)]
                        - df_pivoted[(stat, "value", baseline)]
                    )

        # Sort columns to get a stable table layout across runs
        df_pivoted = df_pivoted[sorted(df_pivoted.columns)]

        test_output_path = f"{test_value}.html"
        with open(test_output_path, "w", encoding="UTF-8") as writer:
            writer.write("<br>")
            styled = df_pivoted.style.format(precision=2)
            styled = styled.set_table_attributes("border=1")
            styled = styled.set_table_styles(
                [{"selector": 'th:contains("->")', "props": [("min-width", "80px")]}]
            )

            # Apply a color gradient to all relative comparison columns
            for baseline in groups:
                for group in groups:
                    if group == baseline:
                        continue
                    for stat in ["p50", "p90"]:
                        diff_col = (stat, "value", f"{baseline}->{group} %")
                        styled = styled.background_gradient(
                            subset=[diff_col], cmap="RdYlGn"
                        )

            writer.write(styled.to_html())
            writer.write("<br>")
            print(f"Ready: {test_output_path}")
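
# Illustrative column layout of the pivoted table for two groups A and B
# (one row per metric/unit/dimension combination, column order may differ):
#
#     (p50, value, A), (p50, value, "A->B %"), (p50, value, "A->B abs"),
#     (p50, value, B), (p50, value, "B->A %"), (p50, value, "B->A abs"),
#     ... and the same set of columns under p90.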


def create_pdf(args, df: pd.DataFrame):
    """Create a PDF per test in the data frame"""

    sns.set_style("whitegrid")
    metrics = df["metric"].unique()
    n_groups = len(df["group"].unique())

    for test_value in df["test"].unique():
        test_output_path = f"{test_value}.pdf"
        with PdfPages(test_output_path) as pdf:
            df_test = df[df["test"] == test_value]
            for dim_value in df_test["dimensions"].unique():
                for metric in metrics:
                    metric_data = df_test[
                        (df_test["metric"] == metric)
                        & (df_test["dimensions"] == dim_value)
                    ]

                    if len(metric_data) == 0:
                        continue

                    additional_title = ""
                    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
                    if n_groups == 2:
                        # Check if the difference between groups is significant
                        a_values = metric_data[metric_data["group"] == "A"][
                            "value"
                        ].values
                        b_values = metric_data[metric_data["group"] == "B"][
                            "value"
                        ].values
                        pvalue, diff_rel, diff_abs = check_regression(
                            a_values, b_values
                        )

                        # Highlight the page if p <= 0.1 and the relative
                        # change is at least 5%. The absolute-difference
                        # threshold is currently 0, i.e. always satisfied.
                        if (
                            pvalue <= 0.1
                            and abs(diff_rel) >= 0.05
                            and abs(diff_abs) >= 0.0
                        ):
                            fig.patch.set_facecolor("lightcoral")
                            additional_title = (
                                f"{diff_rel * 100:+.2f}% ({diff_abs:+.2f}) difference"
                            )

                    # Make a multi-line title since a single line would be
                    # too long
                    dim_items = sorted(str(item) for item in dim_value)
                    dim_chunks = [
                        ", ".join(dim_items[i : i + 4])
                        for i in range(0, len(dim_items), 4)
                    ]
                    dim_str = "\n".join(dim_chunks)
                    title = f"{metric}\n{dim_str}\n{additional_title}"
                    if additional_title:
                        weight = "bold"
                    else:
                        weight = "normal"
                    fig.suptitle(title, fontsize=10, weight=weight)

                    sns.boxenplot(data=metric_data, x="group", y="value", ax=ax1)
                    ax1.set_ylabel(f"{metric} ({metric_data['unit'].iloc[0]})")

                    metric_data_indexed = metric_data.reset_index()
                    # e.g. ("pi", 95) draws a 95% percentile-interval band
                    errorbar = (args.errorbar[0], int(args.errorbar[1]))
                    sns.lineplot(
                        data=metric_data_indexed,
                        x="index",
                        y="value",
                        hue="group",
                        ax=ax2,
                        errorbar=errorbar,
                    )
                    ax2.set_ylabel(f"{metric} ({metric_data['unit'].iloc[0]})")

                    plt.tight_layout()
                    pdf.savefig()
                    plt.close(fig)
            print(f"Ready: {test_output_path}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Compares metrics from Firecracker A/B test runs and generates per-test plots or tables"
    )
    parser.add_argument(
        "paths",
        nargs="+",
        help="Paths to directories with test runs",
        type=Path,
    )
    parser.add_argument(
        "--errorbar",
        nargs=2,
        default=["pi", "95"],
        help="Errorbar configuration for the lineplot: type and value, e.g. pi 95",
    )
    parser.add_argument(
        "--output_type",
        default="pdf",
        choices=["pdf", "table"],
        help="Type of the output to generate",
    )
    args = parser.parse_args()

    # Data retrieval
    start_time = time.time()
    all_data = []
    for i, path in enumerate(args.paths):
        data = load_data(path)
        print(f"getting data {i} from {path}: {len(data)}")
        df = pd.DataFrame(data)
        df["group"] = chr(65 + i)  # A, B, C, D, ...
        all_data.append(df)
    print(f"Data retrieval: {time.time() - start_time:.2f}s")

    # Data processing
    start_time = time.time()
    df_combined = pd.concat(all_data, ignore_index=True)
    print(f"Data processing: {time.time() - start_time:.2f}s")

    # Plotting
    start_time = time.time()
    if args.output_type == "pdf":
        create_pdf(args, df_combined)
    elif args.output_type == "table":
        create_table(df_combined)
    print(f"Plotting: {time.time() - start_time:.2f}s")
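
# Hypothetical invocations (script name and paths are illustrative):
#
#     ./plot_ab_metrics.py ./test_results/run_a ./test_results/run_b
#     ./plot_ab_metrics.py ./test_results/run_a ./test_results/run_b \
#         --output_type table --errorbar pi 90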