Skip to content

Commit 4a41420

Browse files
authored
Initial pass at improving autoevals compat (#16)
Correct the signature on scorers to handle unknown inputs, and remove the expectation of `expected`.
1 parent abec4fe commit 4a41420

6 files changed

Lines changed: 102 additions & 30 deletions

File tree

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,5 @@
99

1010
/package-lock.json
1111
/pnpm-lock.yaml
12+
13+
.env

README.md

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,32 @@ export function Factuality(model: LanguageModel) {
176176
}
177177
````
178178

179+
#### Compatibility with `autoevals`
180+
181+
We maintain compatibility with the [autoevals package](https://github.com/braintrustdata/autoevals) from Braintrust. To use it you'll typically need to use the `partial` helper provided on the scorers. For example, with the `ClosedQA` scorer:
182+
183+
```javascript
184+
import { describeEval } from "vitest-evals";
185+
import { ClosedQA } from "autoevals";
186+
187+
describeEval("my evals", {
188+
data: async () => {
189+
// The scenarios you wish to evaluate
190+
return [
191+
{
192+
input: "What is the capital of France?",
193+
expected: "Paris",
194+
}
195+
];
196+
},
197+
task: answerQuestion,
198+
scorers: [ClosedQA.partial({
199+
criteria: "Does the submission indicate that the question is out of scope?",
200+
})],
201+
threshold: 0.6,
202+
})
203+
```
204+
179205
### Separating Evals
180206

181207
An alternative to `skipIf` for controlling whether evals run is creating a separate `vitest` configuration for them. This gives a lot of advantages, particularly allowing you to maintain two completely separate test suites. A good pattern you can enable with this is a filename-based-test selector:

package.json

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,7 @@
55
"types": "./dist/index.d.ts",
66
"main": "./dist/index.js",
77
"module": "./dist/index.mjs",
8-
"files": [
9-
"dist"
10-
],
8+
"files": ["dist"],
119
"exports": {
1210
".": {
1311
"types": "./dist/index.d.ts",
@@ -27,8 +25,8 @@
2725
"lint:fix": "biome lint --fix",
2826
"prepare": "npm run build && simple-git-hooks",
2927
"postinstall": "simple-git-hooks",
30-
"test": "vitest --reporter=./src/reporter.ts",
31-
"test:ci": "vitest run --coverage --reporter=./src/reporter.ts --reporter=junit --outputFile=tests.junit.xml"
28+
"test": "dotenv -e .env -e .env.local -- vitest --reporter=./src/reporter.ts",
29+
"test:ci": "dotenv -e .env -e .env.local -- vitest run --coverage --reporter=./src/reporter.ts --reporter=junit --outputFile=tests.junit.xml"
3230
},
3331
"repository": {
3432
"type": "git",
@@ -49,10 +47,12 @@
4947
"@biomejs/biome": "^1.9.4",
5048
"@vitest/coverage-v8": "^3.1.1",
5149
"autoevals": "^0.0.127",
50+
"dotenv-cli": "^8.0.0",
5251
"lint-staged": "^15.5.0",
52+
"openai": "^4.97.0",
5353
"simple-git-hooks": "^2.12.1",
54-
"tsup": "^8.4.0",
5554
"tinyrainbow": "*",
55+
"tsup": "^8.4.0",
5656
"typescript": "^5.8.3",
5757
"vitest": "*"
5858
},
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
import { describeEval } from "./index";
2+
import { init, ClosedQA, Factuality, Levenshtein } from "autoevals";
3+
import OpenAI from "openai";
4+
5+
// const client = new OpenAI({
6+
// apiKey: process.env.OPENAI_API_KEY,
7+
// })
8+
9+
// init({ client });
10+
11+
// TODO: What's the easiest way to ensure these tests actually run?
12+
describeEval("autoevals Levenshtein", {
13+
data: async () => [
14+
{
15+
input: "What is the capital of France?",
16+
expected: "Paris",
17+
},
18+
],
19+
task: async () => {
20+
return "Paris";
21+
},
22+
scorers: [Levenshtein],
23+
threshold: 1.0,
24+
});
25+
26+
describeEval("autoevals Factuality", {
27+
data: async () => [
28+
{
29+
input: "What is the capital of France?",
30+
expected: "Paris",
31+
},
32+
],
33+
task: async () => {
34+
return "Paris";
35+
},
36+
scorers: [Factuality],
37+
threshold: 1.0,
38+
skipIf: () => !process.env.OPENAI_API_KEY,
39+
});
40+
41+
describeEval("autoevals ClosedQA", {
42+
data: async () => [
43+
{
44+
input: "What is the capital of France?",
45+
expected: "Paris",
46+
},
47+
],
48+
task: async () => {
49+
return "Paris";
50+
},
51+
scorers: [
52+
ClosedQA.partial({
53+
criteria:
54+
"The submission should indicate the correct city, and nothing else.",
55+
}),
56+
],
57+
threshold: 1.0,
58+
skipIf: () => !process.env.OPENAI_API_KEY,
59+
});

src/compatibility.test.ts

Lines changed: 0 additions & 17 deletions
This file was deleted.

src/index.ts

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,12 @@ export type Score = {
1111
};
1212
};
1313

14-
export type ScoreFn = (opts: {
15-
input: string;
16-
output: string;
17-
expected?: string;
18-
}) => Promise<Score> | Score;
14+
export type ScoreFn = (
15+
opts: {
16+
input: string;
17+
output: string;
18+
} & Record<string, unknown>,
19+
) => Promise<Score> | Score;
1920

2021
export type ToEval<R = unknown> = (
2122
expected: string,
@@ -64,6 +65,7 @@ expect.extend({
6465
* });
6566
* ```
6667
*/
68+
// TODO: this needs to be support true extensibility with Eval scorers
6769
toEval: async function toEval(
6870
input: string,
6971
expected: string,
@@ -138,7 +140,7 @@ export function describeEval(
138140
return describe(name, async () => {
139141
const testFn = skipIf ? test.skipIf(skipIf()) : test;
140142
// TODO: should data just be a generator?
141-
for (const { input, expected } of await data()) {
143+
for (const { input, ...params } of await data()) {
142144
testFn(
143145
input,
144146
{
@@ -149,7 +151,7 @@ export function describeEval(
149151

150152
const scores = await Promise.all(
151153
scorers.map((scorer) => {
152-
const result = scorer({ input, expected, output });
154+
const result = scorer({ input, ...params, output });
153155
if (result instanceof Promise) {
154156
return result;
155157
}

0 commit comments

Comments
 (0)