Commit d31ee7c

feat: add ab_plot.py script

Add a script to create tables and plots for performance runs. It works for
both single and multiple runs and can generate PDF and HTML output.

Signed-off-by: Egor Lazarchuk <yegorlz@amazon.co.uk>

1 parent ceeca6a

File tree

2 files changed: +305 −0 lines changed


tests/README.md

Lines changed: 9 additions & 0 deletions

@@ -251,6 +251,15 @@ schedule an A/B-Test in buildkite, the `REVISION_A` and `REVISION_B` environment
 variables need to be set in the "Environment Variables" field under "Options" in
 buildkite's "New Build" modal.
 
+### A/B visualization
+
+To create a visualization of A/B runs, use the `tools/ab_plot.py` script.
+Example usage:
+
+```sh
+./tools/ab_plot.py a_path b_path --output_type pdf
+```
+
 ### Beyond commit comparisons
 
 While our automated A/B-Testing suite only supports A/B-Tests across commit
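
The same script can also emit per-test HTML tables instead of PDFs; a usage
sketch, based on the `--output_type table` mode defined in `tools/ab_plot.py`
below:

```sh
./tools/ab_plot.py a_path b_path --output_type table
```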

tools/ab_plot.py

Lines changed: 296 additions & 0 deletions

#!/usr/bin/env python3
# Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0

import argparse
import glob
import json
import time
from pathlib import Path
from typing import Callable, List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages

pd.set_option("display.float_format", "{:.2f}".format)


def check_regression(
    a_samples: List[float],
    b_samples: List[float],
    statistic: Callable = np.mean,
    *,
    n_resamples=9999,
):
    """
    Check whether two sample groups differ by a statistically significant
    amount, using a permutation test on the difference of their statistics
    """
    result = scipy.stats.permutation_test(
        (a_samples, b_samples),
        lambda x, y: statistic(y) - statistic(x),
        vectorized=False,
        n_resamples=n_resamples,
    )
    statistic_a = statistic(a_samples)

    # p-value, relative difference (vs. group A) and absolute difference
    return result.pvalue, result.statistic / statistic_a, result.statistic


def load_data(data_path: Path):
    """
    Recursively collect `metrics.json` files under the provided path
    """
    data = []
    for name in glob.glob(f"{data_path}/**/metrics.json", recursive=True):
        with open(name, encoding="utf-8") as f:
            j = json.load(f)

        if "performance_test" not in j["dimensions"]:
            print(f"skipping: {name}")
            continue

        metrics = j["metrics"]
        perf_test = j["dimensions"]["performance_test"]
        del j["dimensions"]["performance_test"]
        # A frozenset is hashable, so rows can later be grouped and compared
        # by their dimensions
        dimensions = frozenset(j["dimensions"].items())

        for m in metrics:
            if "cpu_utilization" in m:
                continue
            mm = metrics[m]
            unit = mm["unit"]
            values = mm["values"]
            for i, v in enumerate(values):
                data.append(
                    {
                        "index": i,
                        "test": perf_test,
                        "metric": m,
                        "value": v,
                        "unit": unit,
                        "dimensions": dimensions,
                    }
                )

    return data

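# For reference, `load_data` expects each metrics.json to look roughly like
# the sketch below (shape inferred from the accesses above; the concrete key
# names inside "dimensions" and "metrics" are made up for illustration):
#
#   {
#       "dimensions": {"performance_test": "some_test", "host_kernel": "..."},
#       "metrics": {"some_metric": {"unit": "ms", "values": [1.2, 1.3, 1.1]}}
#   }
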
def p50(a):
    """Returns 50th percentile of 1d-array a"""
    return np.percentile(a, 50)


def p90(a):
    """Returns 90th percentile of 1d-array a"""
    return np.percentile(a, 90)


def create_table(df: pd.DataFrame):
    """Create an HTML table per test in the data frame"""

    for test_value in df["test"].unique():
        df_test = df[df["test"] == test_value]

        # Split dimensions into separate columns
        df_expanded = df_test.copy()
        dim_data = []
        for _, row in df_expanded.iterrows():
            dim_dict = dict(row["dimensions"])
            dim_data.append(dim_dict)

        # Need to reset indexes because otherwise `pd.concat` will add NaN in
        # all rows where indexes differ
        dim_df = pd.DataFrame(dim_data).reset_index(drop=True)
        df_data = df_expanded.drop("dimensions", axis=1).reset_index(drop=True)
        df_expanded = pd.concat([df_data, dim_df], axis=1)

        # Use dimension columns as index
        dim_cols = sorted(list(dim_df.columns))
        df_pivoted = df_expanded.pivot_table(
            values=["value"],
            index=["metric", "unit"] + dim_cols,
            columns="group",
            aggfunc=[p50, p90],
        )

        # Add relative and absolute comparison columns for every ordered pair
        # of groups
        groups = sorted(df_test["group"].unique())
        for baseline in groups:
            for group in groups:
                if group == baseline:
                    continue
                for stat in ["p50", "p90"]:
                    diff_col = (stat, "value", f"{baseline}->{group} %")
                    df_pivoted[diff_col] = (
                        (
                            df_pivoted[(stat, "value", group)]
                            - df_pivoted[(stat, "value", baseline)]
                        )
                        / df_pivoted[(stat, "value", baseline)]
                        * 100.0
                    )
                    diff_col = (stat, "value", f"{baseline}->{group} abs")
                    df_pivoted[diff_col] = (
                        df_pivoted[(stat, "value", group)]
                        - df_pivoted[(stat, "value", baseline)]
                    )

        # Sort columns to have a persistent table representation
        df_pivoted = df_pivoted[sorted(df_pivoted.columns)]

        test_output_path = f"{test_value}.html"
        with open(test_output_path, "w", encoding="UTF-8") as writer:
            writer.write("<br>")
            styled = df_pivoted.style.format(precision=2)
            styled = styled.set_table_attributes("border=1")
            styled = styled.set_table_styles(
                [{"selector": 'th:contains("->")', "props": [("min-width", "80px")]}]
            )

            # Apply color gradient to all comparison columns
            for baseline in groups:
                for group in groups:
                    if group == baseline:
                        continue
                    for stat in ["p50", "p90"]:
                        diff_col = (stat, "value", f"{baseline}->{group} %")
                        styled = styled.background_gradient(
                            subset=[diff_col], cmap="RdYlGn"
                        )

            writer.write(styled.to_html())
            writer.write("<br>")
        print(f"Ready: {test_output_path}")

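# For orientation: with two runs (groups A and B) the pivot above ends up
# with MultiIndex columns of the form (stat, "value", label), e.g.
#
#   ("p50", "value", "A"), ("p50", "value", "B"),
#   ("p50", "value", "A->B %"), ("p50", "value", "A->B abs"),
#
# and likewise for "p90", with one row per (metric, unit, *dimensions) tuple.
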
def create_pdf(args, df: pd.DataFrame):
    """Create a PDF per test in the data frame"""

    sns.set_style("whitegrid")
    metrics = df["metric"].unique()
    n_groups = len(df["group"].unique())

    for test_value in df["test"].unique():
        test_output_path = f"{test_value}.pdf"
        with PdfPages(test_output_path) as pdf:
            df_test = df[df["test"] == test_value]
            for dim_value in df_test["dimensions"].unique():
                for metric in metrics:
                    metric_data = df_test[
                        (df_test["metric"] == metric)
                        & (df_test["dimensions"] == dim_value)
                    ]

                    if len(metric_data) == 0:
                        continue

                    additional_title = ""
                    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
                    if n_groups == 2:
                        # Check if the difference between A and B is significant
                        a_values = metric_data[metric_data["group"] == "A"][
                            "value"
                        ].values
                        b_values = metric_data[metric_data["group"] == "B"][
                            "value"
                        ].values
                        pvalue, diff_rel, diff_abs = check_regression(
                            a_values, b_values
                        )

                        # Flag the figure if the difference is significant
                        # (p <= 0.1) and at least 5% relative; the absolute
                        # threshold is currently 0, i.e. effectively disabled
                        if (
                            pvalue <= 0.1
                            and abs(diff_rel) >= 0.05
                            and abs(diff_abs) >= 0.0
                        ):
                            fig.patch.set_facecolor("lightcoral")
                            additional_title = (
                                f"{diff_rel * 100:+.2f}% ({diff_abs:+.2f}) difference"
                            )

                    # Make a multi-line title since a single line would be too long
                    dim_items = sorted(str(item) for item in dim_value)
                    dim_chunks = [
                        ", ".join(dim_items[i : i + 4])
                        for i in range(0, len(dim_items), 4)
                    ]
                    dim_str = "\n".join(dim_chunks)
                    title = f"{metric}\n{dim_str}\n{additional_title}"
                    if additional_title:
                        weight = "bold"
                    else:
                        weight = "normal"
                    fig.suptitle(title, fontsize=10, weight=weight)

                    # Left panel: value distribution per group
                    sns.boxenplot(data=metric_data, x="group", y="value", ax=ax1)
                    ax1.set_ylabel(f"{metric} ({metric_data['unit'].iloc[0]})")

                    # Right panel: value over sample index, one line per group
                    metric_data_indexed = metric_data.reset_index()
                    errorbar = (args.errorbar[0], int(args.errorbar[1]))
                    sns.lineplot(
                        data=metric_data_indexed,
                        x="index",
                        y="value",
                        hue="group",
                        ax=ax2,
                        errorbar=errorbar,
                    )
                    ax2.set_ylabel(f"{metric} ({metric_data['unit'].iloc[0]})")

                    plt.tight_layout()
                    pdf.savefig()
                    plt.close()
        print(f"Ready: {test_output_path}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Create tables and plots for A/B performance test runs"
    )
    parser.add_argument(
        "paths",
        nargs="+",
        help="Paths to directories with test runs",
        type=Path,
    )
    parser.add_argument(
        "--errorbar",
        nargs=2,
        default=["pi", "95"],
        help="Errorbar configuration for lineplot (type, value)",
    )
    parser.add_argument(
        "--output_type",
        default="pdf",
        choices=["pdf", "table"],
        help="Type of the output to generate",
    )
    args = parser.parse_args()

    # Data retrieval
    start_time = time.time()
    all_data = []
    for i, path in enumerate(args.paths):
        data = load_data(path)
        print(f"getting data {i} from {path}: {len(data)}")
        df = pd.DataFrame(data)
        df["group"] = chr(65 + i)  # A, B, C, D, ...
        all_data.append(df)
    print(f"Data retrieval: {time.time() - start_time:.2f}s")

    # Data processing
    start_time = time.time()
    df_combined = pd.concat(all_data, ignore_index=True)
    print(f"Data processing: {time.time() - start_time:.2f}s")

    # Plotting
    start_time = time.time()
    if args.output_type == "pdf":
        create_pdf(args, df_combined)
    if args.output_type == "table":
        create_table(df_combined)

    print(f"Plotting: {time.time() - start_time:.2f}s")
