Clean up bench comment: remove setup noise and add summary table

Copilot · NullVoxPopuli · Copilot · commit a1123dce9f85 · 2026-03-15T00:03:16.000Z
- Move bench-compare.mjs setup messages (setup, install, CPU pinning) to
  stderr so only mitata output reaches stdout
- Suppress pnpm install stdout in control dir setup
- Remove 2>&1 from workflow pipe so stderr flows to CI logs, not output file
- Add summary table above <details> by parsing JSON bench results
- Add safety-net regex to strip any leaked setup lines from raw output
- Pass BENCH_JSON_OUTPUT env var to format step

Co-authored-by: NullVoxPopuli <199018+NullVoxPopuli@users.noreply.github.com>
diff --git a/.github/workflows/bench-compare.yml b/.github/workflows/bench-compare.yml
@@ -10,7 +10,7 @@ concurrency:
 
 jobs:
   bench-compare:
-    name: "Benchmark Comparison"
+    name: 'Benchmark Comparison'
     runs-on: ubuntu-latest
     permissions:
       pull-requests: write
@@ -30,12 +30,13 @@ jobs:
           BENCH_JSON_OUTPUT: ${{ runner.temp }}/bench-results.json
         run: |
           set -o pipefail
-          pnpm bench:compare 2>&1 | sed 's/\x1b\[[0-9;]*m//g' > "$RUNNER_TEMP/bench-output.txt"
+          pnpm bench:compare | sed 's/\x1b\[[0-9;]*m//g' > "$RUNNER_TEMP/bench-output.txt"
 
       - name: Format PR comment
         if: always()
         env:
           BENCH_OUTPUT_FILE: ${{ runner.temp }}/bench-output.txt
+          BENCH_JSON_OUTPUT: ${{ runner.temp }}/bench-results.json
           BENCH_JOB_SUCCESS: ${{ job.status == 'success' }}
         run: node scripts/format-bench-comment.mjs > "$RUNNER_TEMP/bench-comment.md"
 
diff --git a/scripts/bench-compare.mjs b/scripts/bench-compare.mjs
@@ -58,10 +58,10 @@ function resolveRef(branch) {
 const ROOT = process.cwd();
 const CONTROL_DIR = join(tmpdir(), `bench-control-${BASE_BRANCH}-${Date.now()}`);
 
-console.log(`\n🔧  Setting up control (${BASE_BRANCH}) in ${CONTROL_DIR}\n`);
+console.error(`\n🔧  Setting up control (${BASE_BRANCH}) in ${CONTROL_DIR}\n`);
 
 const BASE_REF = resolveRef(BASE_BRANCH);
-console.log(`   Resolved ${BASE_BRANCH} → ${BASE_REF.slice(0, 10)}\n`);
+console.error(`   Resolved ${BASE_BRANCH} → ${BASE_REF.slice(0, 10)}\n`);
 
 // Clean up temp dir on exit
 function cleanup() {
@@ -87,11 +87,14 @@ try {
   );
 
   // ── 2. Install dependencies in control dir ───────────────────────────────
-  console.log(`\n📦  Installing dependencies for control (${BASE_BRANCH})…\n`);
-  run('pnpm install --frozen-lockfile', { cwd: CONTROL_DIR });
+  console.error(`\n📦  Installing dependencies for control (${BASE_BRANCH})…\n`);
+  run('pnpm install --frozen-lockfile', {
+    cwd: CONTROL_DIR,
+    stdio: ['inherit', 'pipe', 'inherit'],
+  });
 
   // ── 3. Run mitata bench with --control-dir ───────────────────────────────
-  console.log(`\n🏎️  Running benchmarks (experiment vs control)…\n`);
+  console.error(`\n🏎️  Running benchmarks (experiment vs control)…\n`);
 
   const benchScript = join(ROOT, 'tests/parser.bench.mjs');
   const benchArgs = ['--expose-gc', benchScript, '--control-dir', CONTROL_DIR];
@@ -104,7 +107,7 @@ try {
   const fullArgs = HAS_TASKSET ? ['-c', '0', 'node', ...benchArgs] : benchArgs;
 
   if (HAS_TASKSET) {
-    console.log('📌  CPU pinning enabled (taskset -c 0)\n');
+    console.error('📌  CPU pinning enabled (taskset -c 0)\n');
   }
 
   const result = spawnSync(cmd, fullArgs, {
@@ -118,7 +121,7 @@ try {
     process.exit(1);
   }
 
-  console.log('\n✅  Benchmark comparison complete.\n');
+  console.error('\n✅  Benchmark comparison complete.\n');
 } catch (e) {
   console.error('❌  Error:', e.message);
   process.exit(1);
diff --git a/scripts/format-bench-comment.mjs b/scripts/format-bench-comment.mjs
@@ -1,18 +1,25 @@
 /**
  * Format benchmark comparison results into a GitHub PR comment.
  *
- * Reads the plain-text mitata output from bench-compare.mjs and wraps it in a
- * GitHub-flavored markdown comment.
+ * Reads the plain-text mitata output and (optionally) the JSON results from
+ * the bench run, then produces a GitHub-flavored markdown comment with:
+ *   1. A summary table (when comparison data is available)
+ *   2. Full mitata output in a collapsible <details> section
  *
  * Environment variables:
  *   BENCH_OUTPUT_FILE   - Path to the plain-text bench output
+ *   BENCH_JSON_OUTPUT   - Path to the JSON bench results (optional)
  *   BENCH_JOB_SUCCESS   - Set to "true" if the benchmark job succeeded
  */
 
 import { readFileSync } from 'node:fs';
 
 const marker = '<!-- bench-compare -->';
 
+// ---------------------------------------------------------------------------
+// Read raw mitata output
+// ---------------------------------------------------------------------------
+
 let rawOutput;
 try {
   rawOutput = readFileSync(process.env.BENCH_OUTPUT_FILE, 'utf8').trim();
@@ -21,13 +28,99 @@ try {
   rawOutput = '(no output — benchmark may have failed to start)';
 }
 
+// Strip any lines before the mitata header (safety net for leaked setup messages)
+const benchStart = rawOutput.search(/^(clk:|benchmark\b)/m);
+if (benchStart > 0) {
+  rawOutput = rawOutput.slice(benchStart);
+}
+
+// ---------------------------------------------------------------------------
+// Read JSON results (if available) and build summary
+// ---------------------------------------------------------------------------
+
+let summarySection = '';
+const jsonPath = process.env.BENCH_JSON_OUTPUT;
+
+if (jsonPath) {
+  try {
+    const json = JSON.parse(readFileSync(jsonPath, 'utf8'));
+    summarySection = buildSummary(json);
+  } catch {
+    // JSON not available or malformed — skip summary
+  }
+}
+
+function formatTime(ns) {
+  if (ns >= 1e6) return `${(ns / 1e6).toFixed(2)} ms`;
+  if (ns >= 1e3) return `${(ns / 1e3).toFixed(2)} µs`;
+  return `${ns.toFixed(2)} ns`;
+}
+
+function deltaEmoji(pct) {
+  const abs = Math.abs(pct);
+  // negative pct means experiment is faster (lower time = better)
+  if (abs < 1) return '⚪';
+  if (pct <= -5) return '🟢';
+  if (pct >= 5) return '🔴';
+  return '🟡';
+}
+
+function buildSummary(json) {
+  const benchmarks = json.benchmarks || [];
+
+  // In comparison mode, benchmarks come in pairs inside summary groups.
+  // Each benchmark alias is like "gts small (control)" / "gts small (experiment)".
+  // Group them by stripping the suffix.
+  const pairs = new Map();
+
+  for (const trial of benchmarks) {
+    for (const r of trial.runs || []) {
+      if (!r.stats) continue;
+      const m = r.name.match(/^(.+)\s+\((control|experiment)\)$/);
+      if (!m) continue;
+      const [, key, role] = m;
+      if (!pairs.has(key)) pairs.set(key, {});
+      pairs.get(key)[role] = r.stats;
+    }
+  }
+
+  if (pairs.size === 0) return '';
+
+  const rows = [];
+  for (const [name, { control, experiment }] of pairs) {
+    if (!control || !experiment) continue;
+    const delta = ((experiment.avg - control.avg) / control.avg) * 100;
+    const emoji = deltaEmoji(delta);
+    const sign = delta > 0 ? '+' : '';
+    rows.push(
+      `| ${emoji} | ${name} | ${formatTime(control.avg)} | ${formatTime(experiment.avg)} | ${sign}${delta.toFixed(1)}% |`
+    );
+  }
+
+  if (rows.length === 0) return '';
+
+  return [
+    '',
+    '| | Benchmark | Control (avg) | Experiment (avg) | Δ |',
+    '|---|---|---:|---:|---:|',
+    ...rows,
+    '',
+    '> 🟢 faster · 🔴 slower · 🟡 within 5% · ⚪ within 1%',
+    '',
+  ].join('\n');
+}
+
+// ---------------------------------------------------------------------------
+// Assemble comment
+// ---------------------------------------------------------------------------
+
 const success = process.env.BENCH_JOB_SUCCESS === 'true';
 const heading = success ? '## 🏎️ Benchmark Comparison' : '## ❌ Benchmark Comparison (failed)';
 
 const body = [
   marker,
   heading,
-  '',
+  summarySection,
   '<details>',
   '<summary>Full mitata output</summary>',
   '',