Skip to content

Commit 4a41420

Browse files
authored
Initial pass at improving autoevals compat (#16)
Correct the signature on scorers to handle unknown inputs, and remove the expectation of `expected`.
1 parent abec4fe commit 4a41420

6 files changed

Lines changed: 102 additions & 30 deletions

File tree

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,5 @@
99

1010
/package-lock.json
1111
/pnpm-lock.yaml
12+
13+
.env

README.md

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,32 @@ export function Factuality(model: LanguageModel) {
176176
}
177177
````
178178

179+
#### Compatibility with `autoevals`
180+
181+
We maintain compatibility with the [autoevals package](https://github.com/braintrustdata/autoevals) from Braintrust. To use it you'll typically need to use the `partial` helper provided on the scorers. For example, with the `ClosedQA` scorer:
182+
183+
```javascript
184+
import { describeEval } from "vitest-evals";
185+
import { ClosedQA } from "autoevals";
186+
187+
describeEval("my evals", {
188+
data: async () => {
189+
// The scenarios you wish to evaluate
190+
return [
191+
{
192+
input: "What is the capital of France?",
193+
expected: "Paris",
194+
}
195+
];
196+
},
197+
task: answerQuestion,
198+
scorers: [ClosedQA.partial({
199+
criteria: "Does the submission indicate that the question is out of scope?",
200+
})],
201+
threshold: 0.6,
202+
})
203+
```
204+
179205
### Separating Evals
180206

181207
An alternative to `skipIf` for controlling whether evals run is creating a separate `vitest` configuration for them. This gives a lot of advantages, particularly allowing you to maintain two completely separate test suites. A good pattern you can enable with this is a filename-based-test selector:

package.json

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,7 @@
55
"types": "./dist/index.d.ts",
66
"main": "./dist/index.js",
77
"module": "./dist/index.mjs",
8-
"files": [
9-
"dist"
10-
],
8+
"files": ["dist"],
119
"exports": {
1210
".": {
1311
"types": "./dist/index.d.ts",
@@ -27,8 +25,8 @@
2725
"lint:fix": "biome lint --fix",
2826
"prepare": "npm run build && simple-git-hooks",
2927
"postinstall": "simple-git-hooks",
30-
"test": "vitest --reporter=./src/reporter.ts",
31-
"test:ci": "vitest run --coverage --reporter=./src/reporter.ts --reporter=junit --outputFile=tests.junit.xml"
28+
"test": "dotenv -e .env -e .env.local -- vitest --reporter=./src/reporter.ts",
29+
"test:ci": "dotenv -e .env -e .env.local -- vitest run --coverage --reporter=./src/reporter.ts --reporter=junit --outputFile=tests.junit.xml"
3230
},
3331
"repository": {
3432
"type": "git",
@@ -49,10 +47,12 @@
4947
"@biomejs/biome": "^1.9.4",
5048
"@vitest/coverage-v8": "^3.1.1",
5149
"autoevals": "^0.0.127",
50+
"dotenv-cli": "^8.0.0",
5251
"lint-staged": "^15.5.0",
52+
"openai": "^4.97.0",
5353
"simple-git-hooks": "^2.12.1",
54-
"tsup": "^8.4.0",
5554
"tinyrainbow": "*",
55+
"tsup": "^8.4.0",
5656
"typescript": "^5.8.3",
5757
"vitest": "*"
5858
},
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
import { describeEval } from "./index";
2+
import { init, ClosedQA, Factuality, Levenshtein } from "autoevals";
3+
import OpenAI from "openai";
4+
5+
// const client = new OpenAI({
6+
// apiKey: process.env.OPENAI_API_KEY,
7+
// })
8+
9+
// init({ client });
10+
11+
// TODO: What's the easiest way to ensure these tests actually run?
12+
describeEval("autoevals Levenshtein", {
13+
data: async () => [
14+
{
15+
input: "What is the capital of France?",
16+
expected: "Paris",
17+
},
18+
],
19+
task: async () => {
20+
return "Paris";
21+
},
22+
scorers: [Levenshtein],
23+
threshold: 1.0,
24+
});
25+
26+
describeEval("autoevals Factuality", {
27+
data: async () => [
28+
{
29+
input: "What is the capital of France?",
30+
expected: "Paris",
31+
},
32+
],
33+
task: async () => {
34+
return "Paris";
35+
},
36+
scorers: [Factuality],
37+
threshold: 1.0,
38+
skipIf: () => !process.env.OPENAI_API_KEY,
39+
});
40+
41+
describeEval("autoevals ClosedQA", {
42+
data: async () => [
43+
{
44+
input: "What is the capital of France?",
45+
expected: "Paris",
46+
},
47+
],
48+
task: async () => {
49+
return "Paris";
50+
},
51+
scorers: [
52+
ClosedQA.partial({
53+
criteria:
54+
"The submission should indicate the correct city, and nothing else.",
55+
}),
56+
],
57+
threshold: 1.0,
58+
skipIf: () => !process.env.OPENAI_API_KEY,
59+
});

src/compatibility.test.ts

Lines changed: 0 additions & 17 deletions
This file was deleted.

src/index.ts

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,12 @@ export type Score = {
1111
};
1212
};
1313

14-
export type ScoreFn = (opts: {
15-
input: string;
16-
output: string;
17-
expected?: string;
18-
}) => Promise<Score> | Score;
14+
export type ScoreFn = (
15+
opts: {
16+
input: string;
17+
output: string;
18+
} & Record<string, unknown>,
19+
) => Promise<Score> | Score;
1920

2021
export type ToEval<R = unknown> = (
2122
expected: string,
@@ -64,6 +65,7 @@ expect.extend({
6465
* });
6566
* ```
6667
*/
68+
// TODO: this needs to be support true extensibility with Eval scorers
6769
toEval: async function toEval(
6870
input: string,
6971
expected: string,
@@ -138,7 +140,7 @@ export function describeEval(
138140
return describe(name, async () => {
139141
const testFn = skipIf ? test.skipIf(skipIf()) : test;
140142
// TODO: should data just be a generator?
141-
for (const { input, expected } of await data()) {
143+
for (const { input, ...params } of await data()) {
142144
testFn(
143145
input,
144146
{
@@ -149,7 +151,7 @@ export function describeEval(
149151

150152
const scores = await Promise.all(
151153
scorers.map((scorer) => {
152-
const result = scorer({ input, expected, output });
154+
const result = scorer({ input, ...params, output });
153155
if (result instanceof Promise) {
154156
return result;
155157
}

0 commit comments

Comments
 (0)