mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-04-20 10:10:56 -07:00
Generalize evals infra to support more types of evals, organization and queuing of named suites (#24941)
This commit is contained in:
committed by
GitHub
parent
bc3ed61adb
commit
f1bb2af6de
@@ -5,16 +5,18 @@
|
||||
*/
|
||||
|
||||
import { describe, expect } from 'vitest';
|
||||
import { evalTest } from './test-helper.js';
|
||||
import {
|
||||
evalTest,
|
||||
assertModelHasOutput,
|
||||
checkModelOutputContent,
|
||||
} from '../integration-tests/test-helper.js';
|
||||
} from './test-helper.js';
|
||||
|
||||
describe('save_memory', () => {
|
||||
const TEST_PREFIX = 'Save memory test: ';
|
||||
const rememberingFavoriteColor = "Agent remembers user's favorite color";
|
||||
evalTest('ALWAYS_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: rememberingFavoriteColor,
|
||||
|
||||
prompt: `remember that my favorite color is blue.
|
||||
@@ -35,6 +37,8 @@ describe('save_memory', () => {
|
||||
});
|
||||
const rememberingCommandRestrictions = 'Agent remembers command restrictions';
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: rememberingCommandRestrictions,
|
||||
|
||||
prompt: `I don't want you to ever run npm commands.`,
|
||||
@@ -54,6 +58,8 @@ describe('save_memory', () => {
|
||||
|
||||
const rememberingWorkflow = 'Agent remembers workflow preferences';
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: rememberingWorkflow,
|
||||
|
||||
prompt: `I want you to always lint after building.`,
|
||||
@@ -74,6 +80,8 @@ describe('save_memory', () => {
|
||||
const ignoringTemporaryInformation =
|
||||
'Agent ignores temporary conversation details';
|
||||
evalTest('ALWAYS_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: ignoringTemporaryInformation,
|
||||
|
||||
prompt: `I'm going to get a coffee.`,
|
||||
@@ -97,6 +105,8 @@ describe('save_memory', () => {
|
||||
|
||||
const rememberingPetName = "Agent remembers user's pet's name";
|
||||
evalTest('ALWAYS_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: rememberingPetName,
|
||||
|
||||
prompt: `Please remember that my dog's name is Buddy.`,
|
||||
@@ -116,6 +126,8 @@ describe('save_memory', () => {
|
||||
|
||||
const rememberingCommandAlias = 'Agent remembers custom command aliases';
|
||||
evalTest('ALWAYS_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: rememberingCommandAlias,
|
||||
|
||||
prompt: `When I say 'start server', you should run 'npm run dev'.`,
|
||||
@@ -136,6 +148,8 @@ describe('save_memory', () => {
|
||||
const ignoringDbSchemaLocation =
|
||||
"Agent ignores workspace's database schema location";
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: ignoringDbSchemaLocation,
|
||||
prompt: `The database schema for this workspace is located in \`db/schema.sql\`.`,
|
||||
assert: async (rig, result) => {
|
||||
@@ -155,6 +169,8 @@ describe('save_memory', () => {
|
||||
const rememberingCodingStyle =
|
||||
"Agent remembers user's coding style preference";
|
||||
evalTest('ALWAYS_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: rememberingCodingStyle,
|
||||
|
||||
prompt: `I prefer to use tabs instead of spaces for indentation.`,
|
||||
@@ -175,6 +191,8 @@ describe('save_memory', () => {
|
||||
const ignoringBuildArtifactLocation =
|
||||
'Agent ignores workspace build artifact location';
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: ignoringBuildArtifactLocation,
|
||||
prompt: `In this workspace, build artifacts are stored in the \`dist/artifacts\` directory.`,
|
||||
assert: async (rig, result) => {
|
||||
@@ -193,6 +211,8 @@ describe('save_memory', () => {
|
||||
|
||||
const ignoringMainEntryPoint = "Agent ignores workspace's main entry point";
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: ignoringMainEntryPoint,
|
||||
prompt: `The main entry point for this workspace is \`src/index.js\`.`,
|
||||
assert: async (rig, result) => {
|
||||
@@ -211,6 +231,8 @@ describe('save_memory', () => {
|
||||
|
||||
const rememberingBirthday = "Agent remembers user's birthday";
|
||||
evalTest('ALWAYS_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: rememberingBirthday,
|
||||
|
||||
prompt: `My birthday is on June 15th.`,
|
||||
@@ -231,6 +253,8 @@ describe('save_memory', () => {
|
||||
const proactiveMemoryFromLongSession =
|
||||
'Agent saves preference from earlier in conversation history';
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: proactiveMemoryFromLongSession,
|
||||
params: {
|
||||
settings: {
|
||||
@@ -309,6 +333,8 @@ describe('save_memory', () => {
|
||||
const memoryManagerRoutingPreferences =
|
||||
'Agent routes global and project preferences to memory';
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: memoryManagerRoutingPreferences,
|
||||
params: {
|
||||
settings: {
|
||||
|
||||
Reference in New Issue
Block a user