build(vitest)!: Drop 3.x and improve eval reporter (#38)

dcramer · codex · web-flow · commit 68bb629399cb · 2026-04-19T09:29:35.000-07:00
Drop Vitest 3.x support and switch the eval reporter to Vitest 4's
verbose reporter path.

The reporter change streams eval cases as they finish instead of
flushing them when the test module ends, while still keeping the eval
score suffix. That implementation depends on the Vitest 4 reporter
surface in `vitest/node`. Vitest 3 uses a materially different verbose
reporter path, so keeping 3.x compatibility here would mean carrying a
second internal code path just for reporter behavior.

This narrows the Vitest peer range to 4.x, removes the deprecated
`vitest/reporters` import, tightens the reporter tests to use real
Vitest state values, explicitly covers failed eval output, and adds a
clarifying comment around the `DefaultReporter` bookkeeping handoff used
for eval cases.

Validated with `pnpm run test -- src/reporter.test.ts` and `pnpm build`.

---------

Co-authored-by: Codex <codex@openai.com>
diff --git a/package.json b/package.json
@@ -5,9 +5,7 @@
   "types": "./dist/index.d.ts",
   "main": "./dist/index.js",
   "module": "./dist/index.mjs",
-  "files": [
-    "dist"
-  ],
+  "files": ["dist"],
   "exports": {
     ".": {
       "types": "./dist/index.d.ts",
@@ -59,7 +57,7 @@
   "peerDependencies": {
     "ai": ">=4 <7",
     "tinyrainbow": ">=2 <4",
-    "vitest": ">=3 <5",
+    "vitest": ">=4 <5",
     "zod": ">=3 <5"
   },
   "peerDependenciesMeta": {
diff --git a/src/reporter.test.ts b/src/reporter.test.ts
@@ -0,0 +1,122 @@
+import { stripVTControlCharacters } from "node:util";
+import { describe, expect, test, vi } from "vitest";
+import DefaultEvalReporter from "./reporter";
+
+function createReporter() {
+  const logger = {
+    log: vi.fn(),
+    error: vi.fn(),
+    printBanner: vi.fn(),
+    printNoTestFound: vi.fn(),
+  };
+
+  const reporter = new DefaultEvalReporter({ isTTY: false });
+  reporter.onInit({
+    logger,
+    config: {
+      hideSkippedTests: false,
+      slowTestThreshold: 300,
+      root: process.cwd(),
+    },
+  } as any);
+
+  return { reporter, logger };
+}
+
+function createTestCase({
+  avgScore,
+  state = "passed",
+}: {
+  avgScore?: number;
+  state?: "passed" | "failed";
+}) {
+  return {
+    task: {
+      name: "streams eval progress",
+      type: "test",
+      mode: "run",
+      file: {
+        name: "fixtures/reporter.eval.test.ts",
+      },
+      result: {
+        state,
+        duration: 42,
+      },
+    },
+    module: {
+      task: {
+        name: "fixtures/reporter.eval.test.ts",
+      },
+      project: {
+        name: "",
+      },
+    },
+    project: {
+      name: "",
+    },
+    options: {},
+    location: {
+      line: 12,
+      column: 3,
+    },
+    meta: () => (avgScore == null ? {} : { eval: { avgScore } }),
+    result: () => ({
+      state,
+      errors: state === "failed" ? [{ message: "threshold not met" }] : [],
+    }),
+    diagnostic: () => ({
+      duration: 42,
+    }),
+    annotations: () => [],
+  };
+}
+
+describe("DefaultEvalReporter", () => {
+  test("streams eval test cases with scores and avoids a file-end flush", () => {
+    const { reporter, logger } = createReporter();
+    const testCase = createTestCase({ avgScore: 0.82 });
+
+    reporter.onTestCaseResult(testCase as any);
+
+    expect(logger.log).toHaveBeenCalledTimes(1);
+    expect(stripVTControlCharacters(logger.log.mock.calls[0][0])).toContain(
+      "fixtures/reporter.eval.test.ts:12:3 > streams eval progress [0.82] 42ms",
+    );
+
+    reporter.onTestModuleEnd({
+      state: () => "passed",
+      task: {},
+    } as any);
+
+    expect(logger.log).toHaveBeenCalledTimes(1);
+  });
+
+  test("falls back to verbose output for non-eval tests", () => {
+    const { reporter, logger } = createReporter();
+
+    reporter.onTestCaseResult(createTestCase({}) as any);
+
+    expect(stripVTControlCharacters(logger.log.mock.calls[0][0])).toContain(
+      "fixtures/reporter.eval.test.ts:12:3 > streams eval progress 42ms",
+    );
+    expect(stripVTControlCharacters(logger.log.mock.calls[0][0])).not.toContain(
+      "[0.",
+    );
+  });
+
+  test("logs failed eval test details with the score suffix", () => {
+    const { reporter, logger } = createReporter();
+
+    reporter.onTestCaseResult(
+      createTestCase({ avgScore: 0.2, state: "failed" }) as any,
+    );
+
+    expect(logger.log).toHaveBeenCalledTimes(2);
+    expect(stripVTControlCharacters(logger.log.mock.calls[0][0])).toContain(
+      "fixtures/reporter.eval.test.ts:12:3 > streams eval progress [0.20] 42ms",
+    );
+    expect(stripVTControlCharacters(logger.log.mock.calls[1][0])).toContain(
+      "threshold not met",
+    );
+  });
+});
diff --git a/src/reporter.ts b/src/reporter.ts
@@ -1,28 +1,58 @@
-// import type { RunnerTask, RunnerTestFile } from "vitest";
-// TODO: Switch to "vitest/node" when we drop Vitest 3 support.
-import { DefaultReporter } from "vitest/reporters";
+import { DefaultReporter, VerboseReporter } from "vitest/node";
 import c from "tinyrainbow";
 
-export default class DefaultEvalReporter extends DefaultReporter {
-  protected override printTestCase(moduleState: any, test: any): void {
-    const meta = test.meta();
-    const testResult = test.result();
+const TEST_NAME_SEPARATOR = c.dim(" > ");
 
+export default class DefaultEvalReporter extends VerboseReporter {
+  override onTestCaseResult(test: any): void {
+    const meta = test.meta();
     if (!meta.eval) {
-      super.printTestCase(moduleState, test);
+      super.onTestCaseResult(test);
       return;
     }
 
-    const padding = this.getTestIndentation(test.task);
-    const icon = testResult.state === "failed" ? c.red("✗ ") : "  ";
+    // Preserve DefaultReporter's bookkeeping without letting VerboseReporter
+    // print the stock per-test line; eval cases need custom score output.
+    DefaultReporter.prototype.onTestCaseResult.call(this, test);
+
+    const testResult = test.result();
+    if (
+      this.ctx.config.hideSkippedTests &&
+      testResult.state === "skipped" &&
+      test.options?.mode !== "todo"
+    ) {
+      return;
+    }
+
+    this.logEvalTestCase(test, meta.eval.avgScore);
+
+    if (testResult.state === "failed") {
+      for (const error of testResult.errors) {
+        this.log(c.red(`   → ${error.message}`));
+      }
+    }
+
+    if (test.annotations().length) {
+      this.log();
+      this.printAnnotations(test, "log", 3);
+      this.log();
+    }
+  }
+
+  private logEvalTestCase(test: any, avgScore: number): void {
     const colorFn =
-      meta.eval.avgScore < 0.5
-        ? c.red
-        : meta.eval.avgScore < 0.75
-          ? c.yellow
-          : c.green;
-    this.log(
-      `${padding}${icon}${this.getTestName(test.task, c.dim(" > "))} [${colorFn(meta.eval.avgScore.toFixed(2))}]`,
-    );
+      avgScore < 0.5 ? c.red : avgScore < 0.75 ? c.yellow : c.green;
+
+    let title = ` ${this.getEntityPrefix(test)} `;
+    title += test.module.task.name;
+    if (test.location) {
+      title += c.dim(`:${test.location.line}:${test.location.column}`);
+    }
+    title += TEST_NAME_SEPARATOR;
+    title += this.getTestName(test.task, TEST_NAME_SEPARATOR);
+    title += ` [${colorFn(avgScore.toFixed(2))}]`;
+    title += this.getTestCaseSuffix(test);
+
+    this.log(title);
   }
 }