Merge pull request #187 from NullVoxPopuli-ai-agent/increase-benchmark-sampling

NullVoxPopuli · web-flow · commit 266b1e83e26d · 2026-03-20T17:23:30.000-04:00
Increase benchmark sampling 5x for CI stability
diff --git a/package.json b/package.json
@@ -25,7 +25,7 @@
     "lint:js": "eslint . --max-warnings=0",
     "lint:js:fix": "eslint . --fix --max-warnings=0",
     "lint:package": "publint",
-    "bench": "node --expose-gc tests/parser.bench.mjs",
+    "bench": "./scripts/run-bench.sh tests/parser.bench.mjs",
     "bench:compare": "node scripts/bench-compare.mjs",
     "bench:summary": "./scripts/local-bench-summary.sh",
     "test": "vitest run"
diff --git a/scripts/bench-utils.mjs b/scripts/bench-utils.mjs
@@ -0,0 +1,85 @@
+/**
+ * Shared utilities for benchmark formatting scripts.
+ */
+
+import { readFileSync } from 'node:fs';
+
+/** Matches run names of the form "<key> (control)" or "<key> (experiment)". */
+const PAIR_NAME = /^(.+)\s+\((control|experiment)\)$/;
+
+/**
+ * Format a duration in nanoseconds using the largest unit that fits.
+ *
+ * @param {number} ns - Duration in nanoseconds.
+ * @returns {string} Value with two decimals followed by `ms`, `µs` or `ns`.
+ */
+export function formatTime(ns) {
+  if (ns >= 1e6) return `${(ns / 1e6).toFixed(2)} ms`;
+  if (ns >= 1e3) return `${(ns / 1e3).toFixed(2)} µs`;
+  return `${ns.toFixed(2)} ns`;
+}
+
+/**
+ * Pick the status emoji for a relative change in time.
+ * Negative values mean the experiment took less time than the control.
+ *
+ * @param {number} pct - Relative change in percent.
+ * @returns {string} '⚪' below 2% either way, '🟢' faster, '🔴' 5% or more slower, '🟠' otherwise.
+ */
+export function deltaEmoji(pct) {
+  if (Math.abs(pct) < 2) return '⚪';
+  if (pct < 0) return '🟢';
+  if (pct >= 5) return '🔴';
+  return '🟠';
+}
+
+/**
+ * Parse benchmark JSON results into control/experiment pairs with deltas.
+ * Uses p50 (median) which is more robust to outliers than avg.
+ *
+ * Runs are paired by name: "<key> (control)" and "<key> (experiment)".
+ * Pairs missing either side, or lacking finite values and a non-zero control,
+ * are skipped so one malformed entry cannot produce NaN/Infinity deltas.
+ *
+ * @param {object} json - Parsed bench results; reads `benchmarks[].runs[]` entries with `name` and `stats`.
+ * @returns {Array<{name: string, control: number, experiment: number, delta: number}>}
+ *   One row per complete pair; `delta` is the percentage change of experiment relative to control.
+ */
+export function parsePairs(json) {
+  const pairs = new Map();
+
+  for (const trial of json?.benchmarks || []) {
+    for (const r of trial?.runs || []) {
+      if (!r?.stats || typeof r.name !== 'string') continue;
+      const m = r.name.match(PAIR_NAME);
+      if (!m) continue;
+      const [, key, role] = m;
+      if (!pairs.has(key)) pairs.set(key, {});
+      pairs.get(key)[role] = r.stats;
+    }
+  }
+
+  const rows = [];
+  for (const [name, { control, experiment }] of pairs) {
+    if (!control || !experiment) continue;
+    const ctrlVal = control.p50 ?? control.avg;
+    const expVal = experiment.p50 ?? experiment.avg;
+    // Without finite values and a non-zero baseline the percentage is undefined.
+    if (!Number.isFinite(ctrlVal) || !Number.isFinite(expVal) || ctrlVal === 0) continue;
+    const delta = ((expVal - ctrlVal) / ctrlVal) * 100;
+    rows.push({ name, control: ctrlVal, experiment: expVal, delta });
+  }
+
+  return rows;
+}
+
+/**
+ * Read and parse the benchmark JSON results file.
+ *
+ * @param {string} path - Location of the JSON file.
+ * @returns {*} The parsed JSON value.
+ * @throws {Error} If the file cannot be read or does not contain valid JSON.
+ */
+export function readBenchJSON(path) {
+  return JSON.parse(readFileSync(path, 'utf8'));
+}
diff --git a/scripts/format-bench-cli.mjs b/scripts/format-bench-cli.mjs
@@ -6,7 +6,7 @@
  *   BENCH_JSON_OUTPUT - Path to the JSON bench results
  */
 
-import { readFileSync } from 'node:fs';
+import { formatTime, deltaEmoji, parsePairs, readBenchJSON } from './bench-utils.mjs';
 
 const jsonPath = process.env.BENCH_JSON_OUTPUT;
 
@@ -18,67 +18,13 @@ if (!jsonPath) {
 let json;
 
 try {
-  json = JSON.parse(readFileSync(jsonPath, 'utf8'));
+  json = readBenchJSON(jsonPath);
 } catch (e) {
   console.error(`Could not read ${jsonPath}: ${e.message}`);
   process.exit(1);
 }
 
-function formatTime(ns) {
-  if (ns >= 1e6) return `${(ns / 1e6).toFixed(2)} ms`;
-  if (ns >= 1e3) return `${(ns / 1e3).toFixed(2)} µs`;
-
-  return `${ns.toFixed(2)} ns`;
-}
-
-function deltaEmoji(pct) {
-  const abs = Math.abs(pct);
-
-  if (abs < 2) return '⚪';
-  if (pct <= -5) return '🟢';
-  if (pct >= 5) return '🔴';
-  if (pct < 0) return '🟢';
-
-  return '🟠';
-}
-
-// Group control/experiment pairs
-const pairs = new Map();
-
-for (const trial of json.benchmarks || []) {
-  for (const r of trial.runs || []) {
-    if (!r.stats) continue;
-
-    const m = r.name.match(/^(.+)\s+\((control|experiment)\)$/);
-
-    if (!m) continue;
-
-    const [, key, role] = m;
-
-    if (!pairs.has(key)) pairs.set(key, {});
-
-    pairs.get(key)[role] = r.stats;
-  }
-}
-
-if (pairs.size === 0) {
-  console.log('No comparison data found.');
-  process.exit(0);
-}
-
-// Build rows — use median (p50) which is far more robust to outliers from
-// CPU frequency scaling, GC pauses, and other system noise than the mean.
-const rows = [];
-
-for (const [name, { control, experiment }] of pairs) {
-  if (!control || !experiment) continue;
-
-  const ctrlVal = control.p50 ?? control.avg;
-  const expVal = experiment.p50 ?? experiment.avg;
-  const delta = ((expVal - ctrlVal) / ctrlVal) * 100;
-
-  rows.push({ name, control: ctrlVal, experiment: expVal, delta });
-}
+const rows = parsePairs(json);
 
 if (rows.length === 0) {
   console.log('No comparison data found.');
@@ -96,7 +42,6 @@ const deltaW = Math.max(
   'Δ'.length,
   ...rows.map((r) => {
     const sign = r.delta > 0 ? '+' : '';
-
     return `${sign}${r.delta.toFixed(1)}%`.length;
   })
 );
diff --git a/scripts/format-bench-comment.mjs b/scripts/format-bench-comment.mjs
@@ -13,6 +13,7 @@
  */
 
 import { readFileSync } from 'node:fs';
+import { formatTime, deltaEmoji, parsePairs, readBenchJSON } from './bench-utils.mjs';
 
 const marker = '<!-- bench-compare -->';
 
@@ -43,74 +44,30 @@ const jsonPath = process.env.BENCH_JSON_OUTPUT;
 
 if (jsonPath) {
   try {
-    const json = JSON.parse(readFileSync(jsonPath, 'utf8'));
-    summarySection = buildSummary(json);
+    const rows = parsePairs(readBenchJSON(jsonPath));
+
+    if (rows.length > 0) {
+      const tableRows = rows.map(({ name, control, experiment, delta }) => {
+        const emoji = deltaEmoji(delta);
+        const sign = delta > 0 ? '+' : '';
+        return `| ${emoji} | ${name} | ${formatTime(control)} | ${formatTime(experiment)} | ${sign}${delta.toFixed(1)}% |`;
+      });
+
+      summarySection = [
+        '',
+        '| | Benchmark | Control (p50) | Experiment (p50) | Δ |',
+        '|---|---|---:|---:|---:|',
+        ...tableRows,
+        '',
+        '> 🟢 faster · 🔴 slower · 🟠 slightly slower · ⚪ within 2%',
+        '',
+      ].join('\n');
+    }
   } catch {
     // JSON not available or malformed — skip summary
   }
 }
 
-function formatTime(ns) {
-  if (ns >= 1e6) return `${(ns / 1e6).toFixed(2)} ms`;
-  if (ns >= 1e3) return `${(ns / 1e3).toFixed(2)} µs`;
-  return `${ns.toFixed(2)} ns`;
-}
-
-function deltaEmoji(pct) {
-  const abs = Math.abs(pct);
-  // negative pct means experiment is faster (lower time = better)
-  if (abs < 2) return '⚪';
-  if (pct <= -5) return '🟢';
-  if (pct >= 5) return '🔴';
-  if (pct < 0) return '🟢';
-  return '🟠';
-}
-
-function buildSummary(json) {
-  const benchmarks = json.benchmarks || [];
-
-  // In comparison mode, benchmarks come in pairs inside summary groups.
-  // Each benchmark alias is like "gts small (control)" / "gts small (experiment)".
-  // Group them by stripping the suffix.
-  const pairs = new Map();
-
-  for (const trial of benchmarks) {
-    for (const r of trial.runs || []) {
-      if (!r.stats) continue;
-      const m = r.name.match(/^(.+)\s+\((control|experiment)\)$/);
-      if (!m) continue;
-      const [, key, role] = m;
-      if (!pairs.has(key)) pairs.set(key, {});
-      pairs.get(key)[role] = r.stats;
-    }
-  }
-
-  if (pairs.size === 0) return '';
-
-  const rows = [];
-  for (const [name, { control, experiment }] of pairs) {
-    if (!control || !experiment) continue;
-    const delta = ((experiment.avg - control.avg) / control.avg) * 100;
-    const emoji = deltaEmoji(delta);
-    const sign = delta > 0 ? '+' : '';
-    rows.push(
-      `| ${emoji} | ${name} | ${formatTime(control.avg)} | ${formatTime(experiment.avg)} | ${sign}${delta.toFixed(1)}% |`
-    );
-  }
-
-  if (rows.length === 0) return '';
-
-  return [
-    '',
-    '| | Benchmark | Control (avg) | Experiment (avg) | Δ |',
-    '|---|---|---:|---:|---:|',
-    ...rows,
-    '',
-    '> 🟢 faster · 🔴 slower · 🟠 slightly slower · ⚪ within 2%',
-    '',
-  ].join('\n');
-}
-
 // ---------------------------------------------------------------------------
 // Assemble comment
 // ---------------------------------------------------------------------------
diff --git a/scripts/local-bench-summary.sh b/scripts/local-bench-summary.sh
@@ -1,39 +1,48 @@
 #!/usr/bin/env bash
 
-# Check CPU tuning on Linux — poor settings cause massive variance
-hw_warnings=""
+export BENCH_JSON_OUTPUT=./bench-results.json
+
+pnpm bench:compare
+
+echo ""
+echo "━━━ Summary ━━━"
+node scripts/format-bench-cli.mjs
+
+# Print tips for reducing variance
+echo "━━━ Tips for more reliable results ━━━"
+echo ""
+
+tips=()
 
 if [ -f /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor ]; then
   gov=$(cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor)
   if [ "$gov" != "performance" ]; then
-    hw_warnings+="⚠️  CPU governor is '$gov' — benchmark results will be noisy.
-   Fix with: sudo cpupower frequency-set -g performance
-"
+    tips+=("CPU governor is '$gov' — set to 'performance' for fixed frequency:")
+    tips+=("  sudo cpupower frequency-set -g performance")
+    tips+=("")
   fi
 fi
 
 if [ -f /sys/devices/system/cpu/cpufreq/boost ]; then
   boost=$(cat /sys/devices/system/cpu/cpufreq/boost)
   if [ "$boost" = "1" ]; then
-    hw_warnings+="⚠️  CPU boost is enabled — frequency varies with thermals.
-   Fix with: echo 0 | sudo tee /sys/devices/system/cpu/cpufreq/boost
-"
+    tips+=("CPU boost is enabled — disable to prevent thermal-dependent frequency:")
+    tips+=("  echo 0 | sudo tee /sys/devices/system/cpu/cpufreq/boost")
+    tips+=("")
+  fi
+elif [ -f /sys/devices/system/cpu/intel_pstate/no_turbo ]; then
+  no_turbo=$(cat /sys/devices/system/cpu/intel_pstate/no_turbo)
+  if [ "$no_turbo" = "0" ]; then
+    tips+=("Intel Turbo Boost is enabled — disable to prevent thermal-dependent frequency:")
+    tips+=("  echo 1 | sudo tee /sys/devices/system/cpu/intel_pstate/no_turbo")
+    tips+=("")
   fi
 fi
 
-if [ -n "$hw_warnings" ]; then
-  echo ""
-  echo "$hw_warnings"
-fi
-
-export BENCH_JSON_OUTPUT=./bench-results.json
-
-pnpm bench:compare
+tips+=("Close other applications to reduce CPU contention")
+tips+=("Run multiple times — if deltas flip sign between runs, they're noise")
 
+for tip in "${tips[@]}"; do
+  echo "  $tip"
+done
 echo ""
-echo "━━━ Summary ━━━"
-node scripts/format-bench-cli.mjs
-
-if [ -n "$hw_warnings" ]; then
-  echo "$hw_warnings"
-fi
diff --git a/scripts/run-bench.sh b/scripts/run-bench.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+#
+# Wrapper that runs a node command with CPU pinning when available.
+#
+# Usage: ./scripts/run-bench.sh <node args...>
+#
+# Environment:
+#   BENCH_CPU - CPU index to pin to (default: 0)
+
+set -euo pipefail
+
+CMD=(node --expose-gc "$@")
+PIN_CPU="${BENCH_CPU:-0}"
+
+# CPU pinning on Linux — keep the process on a single core.
+# taskset can be installed yet unable to pin (for example when the CPU is
+# outside the cpuset this process is allowed to use), so probe with a no-op
+# first and fall back to an unpinned run instead of failing the benchmark.
+if command -v taskset &>/dev/null && taskset -c "$PIN_CPU" true &>/dev/null; then
+  CMD=(taskset -c "$PIN_CPU" "${CMD[@]}")
+  echo "📌  CPU pinning enabled (taskset -c $PIN_CPU)" >&2
+  echo "" >&2
+fi
+
+exec "${CMD[@]}"
diff --git a/tests/parser.bench.mjs b/tests/parser.bench.mjs
@@ -15,7 +15,7 @@ import { createRequire } from 'node:module';
 import { readFileSync } from 'node:fs';
 import { fileURLToPath } from 'node:url';
 import { resolve } from 'node:path';
-import { run, bench, boxplot, summary } from 'mitata';
+import { run, bench, boxplot, summary, do_not_optimize as doNotOptimize } from 'mitata';
 
 // ---------------------------------------------------------------------------
 // CLI args
@@ -111,10 +111,16 @@ for (const { type, ext, experimentParse, controlParse } of PARSERS) {
 
 globalThis.gc?.();
 
+// More iterations per sample → individual GC spikes get diluted, reducing
+// variance on noisy CI runners.  Scale down for larger fixtures so each
+// sample doesn't take too long (mitata needs many samples for stable stats).
+const BENCH_ITERS = { small: 1000, medium: 500, large: 100 };
+
 for (const { type, ext, experimentParse, controlParse } of PARSERS) {
   for (const size of SIZES) {
     const code = FIXTURES[type][size];
     const opts = { ...PARSE_OPTIONS, filePath: `${size}${ext}` };
+    const iters = BENCH_ITERS[size];
 
     // Force a full GC before each benchmark group to reduce GC-triggered variance
     globalThis.gc?.();
@@ -123,13 +129,19 @@ for (const { type, ext, experimentParse, controlParse } of PARSERS) {
       // Side-by-side comparison with boxplots
       boxplot(() => {
         summary(() => {
-          bench(`${type} ${size} (control)`, () => controlParse(code, opts));
-          bench(`${type} ${size} (experiment)`, () => experimentParse(code, opts));
+          bench(`${type} ${size} (control)`, () => {
+            for (let i = 0; i < iters; i++) doNotOptimize(controlParse(code, opts));
+          });
+          bench(`${type} ${size} (experiment)`, () => {
+            for (let i = 0; i < iters; i++) doNotOptimize(experimentParse(code, opts));
+          });
         });
       });
     } else {
       // Standalone mode — just benchmark the local parsers
-      bench(`${type} ${size}`, () => experimentParse(code, opts));
+      bench(`${type} ${size}`, () => {
+        for (let i = 0; i < iters; i++) doNotOptimize(experimentParse(code, opts));
+      });
     }
   }
 }