Skip to content

Commit 5c2b67c

Browse files
Mossaka and claude authored
feat: add historical benchmark storage and trend reporting (#1874)
* feat: add historical benchmark storage and trend reporting

  Store benchmark results in an orphan `benchmark-data` branch as benchmarks/history.json (last 50 entries). Each daily run appends its results, enabling trend analysis and relative regression detection.

  New:
  - scripts/ci/benchmark-trend.ts: generates Markdown trend table showing latest-vs-previous deltas and historical p95 values
  - Workflow steps: fetch history, append results, commit to orphan branch, generate trend report in Step Summary
  - benchmarks/.gitkeep: placeholder for local history file
  - package.json: add benchmark:trend script

  Workflow changes:
  - permissions: contents: write (push to benchmark-data branch)
  - checkout: fetch-depth: 0 (full history for branch operations)

  Closes #1866

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* fix: address review feedback on benchmark history PR

  - Validate history.json on fetch, fall back to [] if corrupted
  - Add concurrency group to prevent overlapping benchmark runs
  - Validate --last CLI argument (reject NaN/negative, use default)
  - Add unit tests for delta computation (7 tests)

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 38cefb6 commit 5c2b67c

5 files changed

Lines changed: 399 additions & 1 deletion

File tree

.github/workflows/performance-monitor.yml

Lines changed: 66 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -11,8 +11,12 @@ on:
1111
required: false
1212
default: "30"
1313

14+
concurrency:
15+
group: performance-benchmark
16+
cancel-in-progress: false # Let the running benchmark finish; queue the next one
17+
1418
permissions:
15-
contents: read
19+
contents: write # Required to push to benchmark-data branch
1620
issues: write
1721

1822
jobs:
@@ -23,6 +27,8 @@ jobs:
2327
steps:
2428
- name: Checkout
2529
uses: actions/checkout@v4
30+
with:
31+
fetch-depth: 0
2632

2733
- name: Setup Node.js
2834
uses: actions/setup-node@v4
@@ -44,6 +50,21 @@ jobs:
4450
WRAPPER
4551
sudo chmod +x /usr/local/bin/awf
4652
53+
- name: Fetch benchmark history
  run: |
    # Seed benchmarks/history.json from the benchmark-data branch, or start
    # from an empty array when the branch or the file does not exist yet.
    mkdir -p benchmarks
    if git fetch origin benchmark-data 2>/dev/null; then
      git show origin/benchmark-data:benchmarks/history.json > benchmarks/history.json 2>/dev/null || echo '[]' > benchmarks/history.json
    else
      echo '[]' > benchmarks/history.json
    fi
    # Validate that the file is a JSON array; fall back to empty array if not.
    # "jq empty" alone accepts any JSON value (and an empty file), but the
    # later append step adds an array to this document and needs an array.
    if ! jq -e 'type == "array"' benchmarks/history.json > /dev/null 2>&1; then
      echo "WARNING: history.json is corrupted, resetting to empty"
      echo '[]' > benchmarks/history.json
    fi
    echo "History entries: $(jq 'length' benchmarks/history.json)"
67+
4768
- name: Run benchmarks
4869
id: benchmark
4970
env:
@@ -76,6 +97,50 @@ jobs:
7697
exit 1
7798
fi
7899
100+
- name: Append to benchmark history
  run: |
    HISTORY_FILE="benchmarks/history.json"
    # Nothing to record when the benchmark step produced no usable results.
    if ! jq -e '.results' benchmark-results.json > /dev/null 2>&1; then
      echo "Invalid or missing benchmark results — skipping history update"
      exit 0
    fi
    # Append this run's summary and keep only the last 50 entries in a single
    # jq pass. The result goes to a temp file first so that a jq failure can
    # never leave a truncated history file behind.
    jq --slurpfile run benchmark-results.json \
      '(. + [$run[0] | {timestamp, commitSha, iterations, results, regressions}]) | .[-50:]' \
      "$HISTORY_FILE" > "$HISTORY_FILE.tmp"
    mv "$HISTORY_FILE.tmp" "$HISTORY_FILE"
    echo "History now has $(jq 'length' "$HISTORY_FILE") entries"
112+
113+
- name: Commit benchmark history
  run: |
    git config user.name "github-actions[bot]"
    git config user.email "github-actions[bot]@users.noreply.github.com"
    # Keep copies outside the work tree; the branch switch below rewrites it.
    cp benchmarks/history.json /tmp/history.json
    cp benchmark-results.json /tmp/benchmark-results.json
    # benchmarks/history.json was generated by an earlier step. If it is
    # untracked on the checked-out ref while being tracked on benchmark-data,
    # git refuses to switch branches because the checkout would overwrite it.
    # Drop the working copy first (it is preserved in /tmp).
    rm -f benchmarks/history.json
    if git fetch origin benchmark-data 2>/dev/null; then
      git checkout benchmark-data
    else
      # First run: create the orphan branch and clear the inherited index.
      git checkout --orphan benchmark-data
      git rm -rf . 2>/dev/null || true
    fi
    mkdir -p benchmarks
    cp /tmp/history.json benchmarks/history.json
    git add benchmarks/history.json
    if git diff --cached --quiet; then
      echo "No changes to commit"
    else
      git commit -m "chore: update benchmark history [skip ci]"
      git push origin benchmark-data
    fi
134+
135+
- name: Restore main branch
  run: |
    # Return to the commit under test, then put back the files that were
    # stashed in /tmp before the branch switch.
    git checkout ${{ github.sha }}
    # The directory is not guaranteed to exist on the restored commit.
    mkdir -p benchmarks
    cp /tmp/history.json benchmarks/history.json
    cp /tmp/benchmark-results.json benchmark-results.json
140+
141+
- name: Generate trend report
  # Append the script's stdout to the job's Step Summary.
  run: npx tsx scripts/ci/benchmark-trend.ts >> "$GITHUB_STEP_SUMMARY"
143+
79144
- name: Check for regressions
80145
id: check
81146
run: |

benchmarks/.gitkeep

Whitespace-only changes.

package.json

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -25,6 +25,7 @@
2525
"docs:build": "cd docs-site && npm run build",
2626
"docs:preview": "cd docs-site && npm run preview",
2727
"benchmark": "npx tsx scripts/ci/benchmark-performance.ts",
28+
"benchmark:trend": "npx tsx scripts/ci/benchmark-trend.ts",
2829
"build:bundle": "npm run build && node scripts/build-bundle.mjs"
2930
},
3031
"keywords": [

scripts/ci/benchmark-trend.test.ts

Lines changed: 150 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,150 @@
1+
/**
2+
* Unit tests for benchmark-trend.ts logic.
3+
*
4+
* Tests the core functions: delta computation and Markdown formatting.
5+
* The script's main() reads from disk and argv, so we test the pure functions directly.
6+
*/
7+
8+
// Re-implement the pure functions here since the script isn't structured as a library.
9+
// This mirrors the logic in benchmark-trend.ts without the CLI/file I/O.
10+
11+
/** Summary of one benchmark metric within a single run. */
interface BenchmarkResult {
  metric: string;
  unit: string;
  values: number[];
  mean: number;
  median: number;
  p95: number;
  p99: number;
}

/** One stored benchmark run, as kept in the history file. */
interface HistoryEntry {
  timestamp: string;
  commitSha: string;
  iterations: number;
  results: BenchmarkResult[];
  regressions: string[];
}

/** Comparison of one metric's p95 value between two runs. */
interface MetricDelta {
  metric: string;
  unit: string;
  current: number;
  previous: number;
  delta: number;
  deltaPercent: number;
  regression: boolean;
}

/** A metric counts as a regression when its p95 grows by more than this percentage. */
const REGRESSION_THRESHOLD_PERCENT = 20;

/**
 * Compares the p95 of every metric in `current` against the same-named metric
 * in `previous`.
 *
 * Metrics that have no counterpart in `previous` are skipped. When the
 * previous p95 is 0 the percentage change is reported as 0 rather than
 * dividing by zero. `deltaPercent` is rounded to one decimal place, while the
 * regression flag is decided on the unrounded percentage and is set only when
 * it is strictly greater than `REGRESSION_THRESHOLD_PERCENT`.
 *
 * @param current - The newer run.
 * @param previous - The run to compare against.
 * @returns One delta per metric present in both runs, in `current` order.
 */
function computeDeltas(current: HistoryEntry, previous: HistoryEntry): MetricDelta[] {
  // Index the previous run once instead of scanning it for every metric.
  // The first occurrence of a metric name wins, matching Array.prototype.find.
  const previousByMetric = new Map<string, BenchmarkResult>();
  for (const result of previous.results) {
    if (!previousByMetric.has(result.metric)) {
      previousByMetric.set(result.metric, result);
    }
  }

  const deltas: MetricDelta[] = [];
  for (const cur of current.results) {
    const prev = previousByMetric.get(cur.metric);
    if (!prev) continue;

    const delta = cur.p95 - prev.p95;
    const deltaPercent = prev.p95 === 0 ? 0 : (delta / prev.p95) * 100;
    deltas.push({
      metric: cur.metric,
      unit: cur.unit,
      current: cur.p95,
      previous: prev.p95,
      delta,
      deltaPercent: Math.round(deltaPercent * 10) / 10,
      regression: deltaPercent > REGRESSION_THRESHOLD_PERCENT,
    });
  }
  return deltas;
}
60+
61+
/**
 * Builds a `HistoryEntry` test fixture.
 *
 * `results` must be supplied; every other field falls back to a fixed default
 * and can be replaced through `overrides`.
 */
function makeEntry(overrides: Partial<HistoryEntry> & { results: BenchmarkResult[] }): HistoryEntry {
  return {
    timestamp: "2026-04-09T06:00:00Z",
    commitSha: "abc1234567890",
    iterations: 30,
    regressions: [],
    // Spread last so caller-supplied fields replace the defaults above.
    ...overrides,
  };
}
70+
71+
/**
 * Builds a `BenchmarkResult` test fixture in which every statistic (mean,
 * median, p95, p99) and the single recorded value all equal `p95`.
 *
 * @param metric - Metric name.
 * @param p95 - Value used for every statistic.
 * @param unit - Unit label; defaults to "ms".
 */
function makeResult(metric: string, p95: number, unit = "ms"): BenchmarkResult {
  return { metric, unit, values: [p95], mean: p95, median: p95, p95, p99: p95 };
}
74+
75+
// ── Tests ─────────────────────────────────────────────────────────
76+
77+
describe("computeDeltas", () => {
  it("computes deltas between two runs", () => {
    const prev = makeEntry({ results: [makeResult("container_startup_warm", 18000)] });
    const curr = makeEntry({ results: [makeResult("container_startup_warm", 13000)] });
    const deltas = computeDeltas(curr, prev);

    expect(deltas).toHaveLength(1);
    expect(deltas[0].metric).toBe("container_startup_warm");
    expect(deltas[0].previous).toBe(18000);
    expect(deltas[0].current).toBe(13000);
    expect(deltas[0].delta).toBe(-5000);
    expect(deltas[0].deltaPercent).toBe(-27.8);
    expect(deltas[0].regression).toBe(false);
  });

  it("flags regression when delta exceeds 20%", () => {
    const prev = makeEntry({ results: [makeResult("container_startup_warm", 10000)] });
    const curr = makeEntry({ results: [makeResult("container_startup_warm", 13000)] });
    const deltas = computeDeltas(curr, prev);

    expect(deltas[0].deltaPercent).toBe(30);
    expect(deltas[0].regression).toBe(true);
  });

  // The threshold comparison is strict: exactly 20% is not a regression.
  it("does not flag regression at exactly 20%", () => {
    const prev = makeEntry({ results: [makeResult("container_startup_warm", 10000)] });
    const curr = makeEntry({ results: [makeResult("container_startup_warm", 12000)] });
    const deltas = computeDeltas(curr, prev);

    expect(deltas[0].deltaPercent).toBe(20);
    expect(deltas[0].regression).toBe(false);
  });

  it("handles multiple metrics", () => {
    const prev = makeEntry({
      results: [makeResult("warm", 18000), makeResult("cold", 28000), makeResult("memory", 20, "MB")],
    });
    const curr = makeEntry({
      results: [makeResult("warm", 13000), makeResult("cold", 26000), makeResult("memory", 22, "MB")],
    });
    const deltas = computeDeltas(curr, prev);

    expect(deltas).toHaveLength(3);
    expect(deltas[0].metric).toBe("warm");
    expect(deltas[1].metric).toBe("cold");
    expect(deltas[2].metric).toBe("memory");
  });

  it("skips metrics missing from previous run", () => {
    const prev = makeEntry({ results: [makeResult("warm", 18000)] });
    const curr = makeEntry({ results: [makeResult("warm", 13000), makeResult("new_metric", 100)] });
    const deltas = computeDeltas(curr, prev);

    expect(deltas).toHaveLength(1);
    expect(deltas[0].metric).toBe("warm");
  });

  // A previous p95 of 0 reports a 0% change instead of dividing by zero.
  it("handles zero previous value without division error", () => {
    const prev = makeEntry({ results: [makeResult("latency", 0)] });
    const curr = makeEntry({ results: [makeResult("latency", 100)] });
    const deltas = computeDeltas(curr, prev);

    expect(deltas[0].deltaPercent).toBe(0);
    expect(deltas[0].regression).toBe(false);
  });

  it("returns empty array for no matching metrics", () => {
    const prev = makeEntry({ results: [makeResult("a", 100)] });
    const curr = makeEntry({ results: [makeResult("b", 200)] });
    const deltas = computeDeltas(curr, prev);

    expect(deltas).toHaveLength(0);
  });
});

0 commit comments

Comments
 (0)