Generalize evals infra to support more types of evals, organization and queuing of named suites (#24941)

This commit is contained in:
Christian Gunderman
2026-04-08 23:57:26 +00:00
committed by GitHub
parent bc3ed61adb
commit f1bb2af6de
32 changed files with 475 additions and 133 deletions
+28 -2
View File
@@ -5,16 +5,18 @@
*/
import { describe, expect } from 'vitest';
import { evalTest } from './test-helper.js';
import {
evalTest,
assertModelHasOutput,
checkModelOutputContent,
} from '../integration-tests/test-helper.js';
} from './test-helper.js';
describe('save_memory', () => {
const TEST_PREFIX = 'Save memory test: ';
const rememberingFavoriteColor = "Agent remembers user's favorite color";
evalTest('ALWAYS_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: rememberingFavoriteColor,
prompt: `remember that my favorite color is blue.
@@ -35,6 +37,8 @@ describe('save_memory', () => {
});
const rememberingCommandRestrictions = 'Agent remembers command restrictions';
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: rememberingCommandRestrictions,
prompt: `I don't want you to ever run npm commands.`,
@@ -54,6 +58,8 @@ describe('save_memory', () => {
const rememberingWorkflow = 'Agent remembers workflow preferences';
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: rememberingWorkflow,
prompt: `I want you to always lint after building.`,
@@ -74,6 +80,8 @@ describe('save_memory', () => {
const ignoringTemporaryInformation =
'Agent ignores temporary conversation details';
evalTest('ALWAYS_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: ignoringTemporaryInformation,
prompt: `I'm going to get a coffee.`,
@@ -97,6 +105,8 @@ describe('save_memory', () => {
const rememberingPetName = "Agent remembers user's pet's name";
evalTest('ALWAYS_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: rememberingPetName,
prompt: `Please remember that my dog's name is Buddy.`,
@@ -116,6 +126,8 @@ describe('save_memory', () => {
const rememberingCommandAlias = 'Agent remembers custom command aliases';
evalTest('ALWAYS_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: rememberingCommandAlias,
prompt: `When I say 'start server', you should run 'npm run dev'.`,
@@ -136,6 +148,8 @@ describe('save_memory', () => {
const ignoringDbSchemaLocation =
"Agent ignores workspace's database schema location";
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: ignoringDbSchemaLocation,
prompt: `The database schema for this workspace is located in \`db/schema.sql\`.`,
assert: async (rig, result) => {
@@ -155,6 +169,8 @@ describe('save_memory', () => {
const rememberingCodingStyle =
"Agent remembers user's coding style preference";
evalTest('ALWAYS_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: rememberingCodingStyle,
prompt: `I prefer to use tabs instead of spaces for indentation.`,
@@ -175,6 +191,8 @@ describe('save_memory', () => {
const ignoringBuildArtifactLocation =
'Agent ignores workspace build artifact location';
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: ignoringBuildArtifactLocation,
prompt: `In this workspace, build artifacts are stored in the \`dist/artifacts\` directory.`,
assert: async (rig, result) => {
@@ -193,6 +211,8 @@ describe('save_memory', () => {
const ignoringMainEntryPoint = "Agent ignores workspace's main entry point";
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: ignoringMainEntryPoint,
prompt: `The main entry point for this workspace is \`src/index.js\`.`,
assert: async (rig, result) => {
@@ -211,6 +231,8 @@ describe('save_memory', () => {
const rememberingBirthday = "Agent remembers user's birthday";
evalTest('ALWAYS_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: rememberingBirthday,
prompt: `My birthday is on June 15th.`,
@@ -231,6 +253,8 @@ describe('save_memory', () => {
const proactiveMemoryFromLongSession =
'Agent saves preference from earlier in conversation history';
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: proactiveMemoryFromLongSession,
params: {
settings: {
@@ -309,6 +333,8 @@ describe('save_memory', () => {
const memoryManagerRoutingPreferences =
'Agent routes global and project preferences to memory';
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: memoryManagerRoutingPreferences,
params: {
settings: {