Skip to content

Commit 266b1e8

Browse files
Merge pull request #187 from NullVoxPopuli-ai-agent/increase-benchmark-sampling
Increase benchmark sampling 5x for CI stability
2 parents f204d82 + 00417d1 commit 266b1e8

7 files changed

Lines changed: 146 additions & 148 deletions

File tree

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
"lint:js": "eslint . --max-warnings=0",
2626
"lint:js:fix": "eslint . --fix --max-warnings=0",
2727
"lint:package": "publint",
28-
"bench": "node --expose-gc tests/parser.bench.mjs",
28+
"bench": "./scripts/run-bench.sh tests/parser.bench.mjs",
2929
"bench:compare": "node scripts/bench-compare.mjs",
3030
"bench:summary": "./scripts/local-bench-summary.sh",
3131
"test": "vitest run"

scripts/bench-utils.mjs

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
/**
2+
* Shared utilities for benchmark formatting scripts.
3+
*/
4+
5+
import { readFileSync } from 'node:fs';
6+
7+
/**
 * Format a duration given in nanoseconds as a short human-readable string.
 *
 * The unit (ns, µs or ms) is picked from the value as it will be displayed,
 * i.e. after rounding to two decimals. Without that, a value just below a
 * unit boundary (such as 999.999 ns) would be printed off-scale as
 * "1000.00 ns" instead of "1.00 µs".
 *
 * @param {number} ns - Duration in nanoseconds.
 * @returns {string} The value with two decimals followed by its unit.
 */
export function formatTime(ns) {
  if (ns >= 1e6) return `${(ns / 1e6).toFixed(2)} ms`;

  if (ns >= 1e3) {
    const micros = (ns / 1e3).toFixed(2);

    // Rounding may lift e.g. 999.9999 µs to "1000.00" — promote to ms instead.
    return Number(micros) >= 1e3 ? `${(ns / 1e6).toFixed(2)} ms` : `${micros} µs`;
  }

  const nanos = ns.toFixed(2);

  // Same promotion at the ns → µs boundary.
  return Number(nanos) >= 1e3 ? `${(ns / 1e3).toFixed(2)} µs` : `${nanos} ns`;
}
12+
13+
export function deltaEmoji(pct) {
14+
const abs = Math.abs(pct);
15+
if (abs < 2) return '⚪';
16+
if (pct <= -5) return '🟢';
17+
if (pct >= 5) return '🔴';
18+
if (pct < 0) return '🟢';
19+
return '🟠';
20+
}
21+
22+
/**
 * Parse benchmark JSON results into control/experiment pairs with deltas.
 * Uses p50 (median) which is more robust to outliers than avg; falls back to
 * avg when p50 is absent.
 *
 * Runs are matched by name: "<key> (control)" and "<key> (experiment)" form
 * one pair. Runs without stats, without a string name, or whose name does not
 * follow that pattern are ignored. Pairs are also dropped when either side is
 * missing, when a value is not a finite number, or when the control value is
 * zero — a percentage delta cannot be computed for those.
 *
 * @param {{ benchmarks?: Array<{ runs?: Array<{ name?: string, stats?: object }> }> }} json
 *   Parsed benchmark results.
 * @returns {Array<{ name: string, control: number, experiment: number, delta: number }>}
 *   One row per complete pair, in first-seen order; `delta` is in percent
 *   (negative when the experiment value is lower than the control value).
 */
export function parsePairs(json) {
  const pairName = /^(.+)\s+\((control|experiment)\)$/;
  const pairs = new Map();

  for (const trial of json?.benchmarks || []) {
    for (const r of trial?.runs || []) {
      if (!r?.stats || typeof r.name !== 'string') continue;

      const m = r.name.match(pairName);

      if (!m) continue;

      const [, key, role] = m;

      if (!pairs.has(key)) pairs.set(key, {});
      pairs.get(key)[role] = r.stats;
    }
  }

  const rows = [];

  for (const [name, { control, experiment }] of pairs) {
    if (!control || !experiment) continue;

    const ctrlVal = control.p50 ?? control.avg;
    const expVal = experiment.p50 ?? experiment.avg;

    // Without finite numbers and a non-zero baseline the delta would be
    // NaN/Infinity, and callers that format the values would fail.
    if (!Number.isFinite(ctrlVal) || !Number.isFinite(expVal) || ctrlVal === 0) continue;

    const delta = ((expVal - ctrlVal) / ctrlVal) * 100;

    rows.push({ name, control: ctrlVal, experiment: expVal, delta });
  }

  return rows;
}
51+
52+
/**
 * Read and parse the benchmark JSON results file.
 *
 * @param {string} path - Path of the JSON file to load.
 * @returns {*} The parsed JSON value.
 * @throws {Error} If the file cannot be read, or a SyntaxError if its content
 *   is not valid JSON. Callers are expected to handle both.
 */
export function readBenchJSON(path) {
  const text = readFileSync(path, 'utf8');

  // Decoding as UTF-8 keeps a leading byte-order mark, which JSON.parse
  // rejects; drop it so files written by BOM-emitting tools still load.
  return JSON.parse(text.charCodeAt(0) === 0xfeff ? text.slice(1) : text);
}

scripts/format-bench-cli.mjs

Lines changed: 3 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
* BENCH_JSON_OUTPUT - Path to the JSON bench results
77
*/
88

9-
import { readFileSync } from 'node:fs';
9+
import { formatTime, deltaEmoji, parsePairs, readBenchJSON } from './bench-utils.mjs';
1010

1111
const jsonPath = process.env.BENCH_JSON_OUTPUT;
1212

@@ -18,67 +18,13 @@ if (!jsonPath) {
1818
let json;
1919

2020
try {
21-
json = JSON.parse(readFileSync(jsonPath, 'utf8'));
21+
json = readBenchJSON(jsonPath);
2222
} catch (e) {
2323
console.error(`Could not read ${jsonPath}: ${e.message}`);
2424
process.exit(1);
2525
}
2626

27-
function formatTime(ns) {
28-
if (ns >= 1e6) return `${(ns / 1e6).toFixed(2)} ms`;
29-
if (ns >= 1e3) return `${(ns / 1e3).toFixed(2)} µs`;
30-
31-
return `${ns.toFixed(2)} ns`;
32-
}
33-
34-
function deltaEmoji(pct) {
35-
const abs = Math.abs(pct);
36-
37-
if (abs < 2) return '⚪';
38-
if (pct <= -5) return '🟢';
39-
if (pct >= 5) return '🔴';
40-
if (pct < 0) return '🟢';
41-
42-
return '🟠';
43-
}
44-
45-
// Group control/experiment pairs
46-
const pairs = new Map();
47-
48-
for (const trial of json.benchmarks || []) {
49-
for (const r of trial.runs || []) {
50-
if (!r.stats) continue;
51-
52-
const m = r.name.match(/^(.+)\s+\((control|experiment)\)$/);
53-
54-
if (!m) continue;
55-
56-
const [, key, role] = m;
57-
58-
if (!pairs.has(key)) pairs.set(key, {});
59-
60-
pairs.get(key)[role] = r.stats;
61-
}
62-
}
63-
64-
if (pairs.size === 0) {
65-
console.log('No comparison data found.');
66-
process.exit(0);
67-
}
68-
69-
// Build rows — use median (p50) which is far more robust to outliers from
70-
// CPU frequency scaling, GC pauses, and other system noise than the mean.
71-
const rows = [];
72-
73-
for (const [name, { control, experiment }] of pairs) {
74-
if (!control || !experiment) continue;
75-
76-
const ctrlVal = control.p50 ?? control.avg;
77-
const expVal = experiment.p50 ?? experiment.avg;
78-
const delta = ((expVal - ctrlVal) / ctrlVal) * 100;
79-
80-
rows.push({ name, control: ctrlVal, experiment: expVal, delta });
81-
}
27+
const rows = parsePairs(json);
8228

8329
if (rows.length === 0) {
8430
console.log('No comparison data found.');
@@ -96,7 +42,6 @@ const deltaW = Math.max(
9642
'Δ'.length,
9743
...rows.map((r) => {
9844
const sign = r.delta > 0 ? '+' : '';
99-
10045
return `${sign}${r.delta.toFixed(1)}%`.length;
10146
})
10247
);

scripts/format-bench-comment.mjs

Lines changed: 20 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
*/
1414

1515
import { readFileSync } from 'node:fs';
16+
import { formatTime, deltaEmoji, parsePairs, readBenchJSON } from './bench-utils.mjs';
1617

1718
const marker = '<!-- bench-compare -->';
1819

@@ -43,74 +44,30 @@ const jsonPath = process.env.BENCH_JSON_OUTPUT;
4344

4445
if (jsonPath) {
4546
try {
46-
const json = JSON.parse(readFileSync(jsonPath, 'utf8'));
47-
summarySection = buildSummary(json);
47+
const rows = parsePairs(readBenchJSON(jsonPath));
48+
49+
if (rows.length > 0) {
50+
const tableRows = rows.map(({ name, control, experiment, delta }) => {
51+
const emoji = deltaEmoji(delta);
52+
const sign = delta > 0 ? '+' : '';
53+
return `| ${emoji} | ${name} | ${formatTime(control)} | ${formatTime(experiment)} | ${sign}${delta.toFixed(1)}% |`;
54+
});
55+
56+
summarySection = [
57+
'',
58+
'| | Benchmark | Control (p50) | Experiment (p50) | Δ |',
59+
'|---|---|---:|---:|---:|',
60+
...tableRows,
61+
'',
62+
'> 🟢 faster · 🔴 slower · 🟠 slightly slower · ⚪ within 2%',
63+
'',
64+
].join('\n');
65+
}
4866
} catch {
4967
// JSON not available or malformed — skip summary
5068
}
5169
}
5270

53-
function formatTime(ns) {
54-
if (ns >= 1e6) return `${(ns / 1e6).toFixed(2)} ms`;
55-
if (ns >= 1e3) return `${(ns / 1e3).toFixed(2)} µs`;
56-
return `${ns.toFixed(2)} ns`;
57-
}
58-
59-
function deltaEmoji(pct) {
60-
const abs = Math.abs(pct);
61-
// negative pct means experiment is faster (lower time = better)
62-
if (abs < 2) return '⚪';
63-
if (pct <= -5) return '🟢';
64-
if (pct >= 5) return '🔴';
65-
if (pct < 0) return '🟢';
66-
return '🟠';
67-
}
68-
69-
function buildSummary(json) {
70-
const benchmarks = json.benchmarks || [];
71-
72-
// In comparison mode, benchmarks come in pairs inside summary groups.
73-
// Each benchmark alias is like "gts small (control)" / "gts small (experiment)".
74-
// Group them by stripping the suffix.
75-
const pairs = new Map();
76-
77-
for (const trial of benchmarks) {
78-
for (const r of trial.runs || []) {
79-
if (!r.stats) continue;
80-
const m = r.name.match(/^(.+)\s+\((control|experiment)\)$/);
81-
if (!m) continue;
82-
const [, key, role] = m;
83-
if (!pairs.has(key)) pairs.set(key, {});
84-
pairs.get(key)[role] = r.stats;
85-
}
86-
}
87-
88-
if (pairs.size === 0) return '';
89-
90-
const rows = [];
91-
for (const [name, { control, experiment }] of pairs) {
92-
if (!control || !experiment) continue;
93-
const delta = ((experiment.avg - control.avg) / control.avg) * 100;
94-
const emoji = deltaEmoji(delta);
95-
const sign = delta > 0 ? '+' : '';
96-
rows.push(
97-
`| ${emoji} | ${name} | ${formatTime(control.avg)} | ${formatTime(experiment.avg)} | ${sign}${delta.toFixed(1)}% |`
98-
);
99-
}
100-
101-
if (rows.length === 0) return '';
102-
103-
return [
104-
'',
105-
'| | Benchmark | Control (avg) | Experiment (avg) | Δ |',
106-
'|---|---|---:|---:|---:|',
107-
...rows,
108-
'',
109-
'> 🟢 faster · 🔴 slower · 🟠 slightly slower · ⚪ within 2%',
110-
'',
111-
].join('\n');
112-
}
113-
11471
// ---------------------------------------------------------------------------
11572
// Assemble comment
11673
// ---------------------------------------------------------------------------

scripts/local-bench-summary.sh

Lines changed: 31 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,39 +1,48 @@
11
#!/usr/bin/env bash
22

3-
# Check CPU tuning on Linux — poor settings cause massive variance
4-
hw_warnings=""
3+
export BENCH_JSON_OUTPUT=./bench-results.json
4+
5+
pnpm bench:compare
6+
7+
echo ""
8+
echo "━━━ Summary ━━━"
9+
node scripts/format-bench-cli.mjs
10+
11+
# Print tips for reducing variance
12+
echo "━━━ Tips for more reliable results ━━━"
13+
echo ""
14+
15+
tips=()
516

617
if [ -f /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor ]; then
718
gov=$(cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor)
819
if [ "$gov" != "performance" ]; then
9-
hw_warnings+="⚠️ CPU governor is '$gov' — benchmark results will be noisy.
10-
Fix with: sudo cpupower frequency-set -g performance
11-
"
20+
tips+=("CPU governor is '$gov' — set to 'performance' for fixed frequency:")
21+
tips+=(" sudo cpupower frequency-set -g performance")
22+
tips+=("")
1223
fi
1324
fi
1425

1526
if [ -f /sys/devices/system/cpu/cpufreq/boost ]; then
1627
boost=$(cat /sys/devices/system/cpu/cpufreq/boost)
1728
if [ "$boost" = "1" ]; then
18-
hw_warnings+="⚠️ CPU boost is enabled — frequency varies with thermals.
19-
Fix with: echo 0 | sudo tee /sys/devices/system/cpu/cpufreq/boost
20-
"
29+
tips+=("CPU boost is enabled — disable to prevent thermal-dependent frequency:")
30+
tips+=(" echo 0 | sudo tee /sys/devices/system/cpu/cpufreq/boost")
31+
tips+=("")
32+
fi
33+
elif [ -f /sys/devices/system/cpu/intel_pstate/no_turbo ]; then
34+
no_turbo=$(cat /sys/devices/system/cpu/intel_pstate/no_turbo)
35+
if [ "$no_turbo" = "0" ]; then
36+
tips+=("Intel Turbo Boost is enabled — disable to prevent thermal-dependent frequency:")
37+
tips+=(" echo 1 | sudo tee /sys/devices/system/cpu/intel_pstate/no_turbo")
38+
tips+=("")
2139
fi
2240
fi
2341

24-
if [ -n "$hw_warnings" ]; then
25-
echo ""
26-
echo "$hw_warnings"
27-
fi
28-
29-
export BENCH_JSON_OUTPUT=./bench-results.json
30-
31-
pnpm bench:compare
42+
tips+=("Close other applications to reduce CPU contention")
43+
tips+=("Run multiple times — if deltas flip sign between runs, they're noise")
3244

45+
for tip in "${tips[@]}"; do
46+
echo " $tip"
47+
done
3348
echo ""
34-
echo "━━━ Summary ━━━"
35-
node scripts/format-bench-cli.mjs
36-
37-
if [ -n "$hw_warnings" ]; then
38-
echo "$hw_warnings"
39-
fi

scripts/run-bench.sh

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
#!/usr/bin/env bash
#
# Wrapper that runs a node command with CPU pinning when available.
#
# Usage: ./scripts/run-bench.sh <node args...>
#
# Environment:
#   BENCH_CPU - index of the CPU to pin the process to (default: 0)

set -euo pipefail

CMD=(node --expose-gc "$@")
CPU="${BENCH_CPU:-0}"

# CPU pinning on Linux — keep the process on a single core.
# Having taskset installed is not enough: when the requested CPU is outside
# the set this process may run on, taskset exits non-zero and the benchmark
# would never start. Probe with a no-op first and fall back to an unpinned
# run if pinning is not possible.
if command -v taskset &>/dev/null && taskset -c "$CPU" true &>/dev/null; then
  CMD=(taskset -c "$CPU" "${CMD[@]}")
  echo "📌 CPU pinning enabled (taskset -c $CPU)" >&2
  echo "" >&2
fi

# Replace this shell with the benchmark so its exit status is the script's.
exec "${CMD[@]}"

tests/parser.bench.mjs

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ import { createRequire } from 'node:module';
1515
import { readFileSync } from 'node:fs';
1616
import { fileURLToPath } from 'node:url';
1717
import { resolve } from 'node:path';
18-
import { run, bench, boxplot, summary } from 'mitata';
18+
import { run, bench, boxplot, summary, do_not_optimize as doNotOptimize } from 'mitata';
1919

2020
// ---------------------------------------------------------------------------
2121
// CLI args
@@ -111,10 +111,16 @@ for (const { type, ext, experimentParse, controlParse } of PARSERS) {
111111

112112
globalThis.gc?.();
113113

114+
// More iterations per sample → individual GC spikes get diluted, reducing
115+
// variance on noisy CI runners. Scale down for larger fixtures so each
116+
// sample doesn't take too long (mitata needs many samples for stable stats).
117+
const BENCH_ITERS = { small: 1000, medium: 500, large: 100 };
118+
114119
for (const { type, ext, experimentParse, controlParse } of PARSERS) {
115120
for (const size of SIZES) {
116121
const code = FIXTURES[type][size];
117122
const opts = { ...PARSE_OPTIONS, filePath: `${size}${ext}` };
123+
const iters = BENCH_ITERS[size];
118124

119125
// Force a full GC before each benchmark group to reduce GC-triggered variance
120126
globalThis.gc?.();
@@ -123,13 +129,19 @@ for (const { type, ext, experimentParse, controlParse } of PARSERS) {
123129
// Side-by-side comparison with boxplots
124130
boxplot(() => {
125131
summary(() => {
126-
bench(`${type} ${size} (control)`, () => controlParse(code, opts));
127-
bench(`${type} ${size} (experiment)`, () => experimentParse(code, opts));
132+
bench(`${type} ${size} (control)`, () => {
133+
for (let i = 0; i < iters; i++) doNotOptimize(controlParse(code, opts));
134+
});
135+
bench(`${type} ${size} (experiment)`, () => {
136+
for (let i = 0; i < iters; i++) doNotOptimize(experimentParse(code, opts));
137+
});
128138
});
129139
});
130140
} else {
131141
// Standalone mode — just benchmark the local parsers
132-
bench(`${type} ${size}`, () => experimentParse(code, opts));
142+
bench(`${type} ${size}`, () => {
143+
for (let i = 0; i < iters; i++) doNotOptimize(experimentParse(code, opts));
144+
});
133145
}
134146
}
135147
}

0 commit comments

Comments
 (0)