mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-03-10 14:10:37 -07:00
Improving memory tool instructions and eval testing (#18091)
This commit is contained in:
@@ -6,11 +6,16 @@
|
||||
|
||||
import { describe, expect } from 'vitest';
|
||||
import { evalTest } from './test-helper.js';
|
||||
import { validateModelOutput } from '../integration-tests/test-helper.js';
|
||||
import {
|
||||
assertModelHasOutput,
|
||||
checkModelOutputContent,
|
||||
} from '../integration-tests/test-helper.js';
|
||||
|
||||
describe('save_memory', () => {
|
||||
const TEST_PREFIX = 'Save memory test: ';
|
||||
const rememberingFavoriteColor = "Agent remembers user's favorite color";
|
||||
evalTest('ALWAYS_PASSES', {
|
||||
name: 'should be able to save to memory',
|
||||
name: rememberingFavoriteColor,
|
||||
params: {
|
||||
settings: { tools: { core: ['save_memory'] } },
|
||||
},
|
||||
@@ -18,13 +23,217 @@ describe('save_memory', () => {
|
||||
|
||||
what is my favorite color? tell me that and surround it with $ symbol`,
|
||||
assert: async (rig, result) => {
|
||||
const foundToolCall = await rig.waitForToolCall('save_memory');
|
||||
expect(
|
||||
foundToolCall,
|
||||
'Expected to find a save_memory tool call',
|
||||
).toBeTruthy();
|
||||
const wasToolCalled = await rig.waitForToolCall('save_memory');
|
||||
expect(wasToolCalled, 'Expected save_memory tool to be called').toBe(
|
||||
true,
|
||||
);
|
||||
|
||||
validateModelOutput(result, 'blue', 'Save memory test');
|
||||
assertModelHasOutput(result);
|
||||
checkModelOutputContent(result, {
|
||||
expectedContent: 'blue',
|
||||
testName: `${TEST_PREFIX}${rememberingFavoriteColor}`,
|
||||
});
|
||||
},
|
||||
});
|
||||
const rememberingCommandRestrictions = 'Agent remembers command restrictions';
|
||||
evalTest('ALWAYS_PASSES', {
|
||||
name: rememberingCommandRestrictions,
|
||||
params: {
|
||||
settings: { tools: { core: ['save_memory'] } },
|
||||
},
|
||||
prompt: `I don't want you to ever run npm commands.`,
|
||||
assert: async (rig, result) => {
|
||||
const wasToolCalled = await rig.waitForToolCall('save_memory');
|
||||
expect(wasToolCalled, 'Expected save_memory tool to be called').toBe(
|
||||
true,
|
||||
);
|
||||
|
||||
assertModelHasOutput(result);
|
||||
checkModelOutputContent(result, {
|
||||
expectedContent: [/not run npm commands|remember|ok/i],
|
||||
testName: `${TEST_PREFIX}${rememberingCommandRestrictions}`,
|
||||
});
|
||||
},
|
||||
});
|
||||
|
||||
const rememberingWorkflow = 'Agent remembers workflow preferences';
|
||||
evalTest('ALWAYS_PASSES', {
|
||||
name: rememberingWorkflow,
|
||||
params: {
|
||||
settings: { tools: { core: ['save_memory'] } },
|
||||
},
|
||||
prompt: `I want you to always lint after building.`,
|
||||
assert: async (rig, result) => {
|
||||
const wasToolCalled = await rig.waitForToolCall('save_memory');
|
||||
expect(wasToolCalled, 'Expected save_memory tool to be called').toBe(
|
||||
true,
|
||||
);
|
||||
|
||||
assertModelHasOutput(result);
|
||||
checkModelOutputContent(result, {
|
||||
expectedContent: [/always|ok|remember|will do/i],
|
||||
testName: `${TEST_PREFIX}${rememberingWorkflow}`,
|
||||
});
|
||||
},
|
||||
});
|
||||
|
||||
const ignoringTemporaryInformation =
|
||||
'Agent ignores temporary conversation details';
|
||||
evalTest('ALWAYS_PASSES', {
|
||||
name: ignoringTemporaryInformation,
|
||||
params: {
|
||||
settings: { tools: { core: ['save_memory'] } },
|
||||
},
|
||||
prompt: `I'm going to get a coffee.`,
|
||||
assert: async (rig, result) => {
|
||||
await rig.waitForTelemetryReady();
|
||||
const wasToolCalled = rig
|
||||
.readToolLogs()
|
||||
.some((log) => log.toolRequest.name === 'save_memory');
|
||||
expect(
|
||||
wasToolCalled,
|
||||
'save_memory should not be called for temporary information',
|
||||
).toBe(false);
|
||||
|
||||
assertModelHasOutput(result);
|
||||
checkModelOutputContent(result, {
|
||||
testName: `${TEST_PREFIX}${ignoringTemporaryInformation}`,
|
||||
forbiddenContent: [/remember|will do/i],
|
||||
});
|
||||
},
|
||||
});
|
||||
|
||||
const rememberingPetName = "Agent remembers user's pet's name";
|
||||
evalTest('ALWAYS_PASSES', {
|
||||
name: rememberingPetName,
|
||||
params: {
|
||||
settings: { tools: { core: ['save_memory'] } },
|
||||
},
|
||||
prompt: `My dog's name is Buddy. What is my dog's name?`,
|
||||
assert: async (rig, result) => {
|
||||
const wasToolCalled = await rig.waitForToolCall('save_memory');
|
||||
expect(wasToolCalled, 'Expected save_memory tool to be called').toBe(
|
||||
true,
|
||||
);
|
||||
|
||||
assertModelHasOutput(result);
|
||||
checkModelOutputContent(result, {
|
||||
expectedContent: [/Buddy/i],
|
||||
testName: `${TEST_PREFIX}${rememberingPetName}`,
|
||||
});
|
||||
},
|
||||
});
|
||||
|
||||
const rememberingCommandAlias = 'Agent remembers custom command aliases';
|
||||
evalTest('ALWAYS_PASSES', {
|
||||
name: rememberingCommandAlias,
|
||||
params: {
|
||||
settings: { tools: { core: ['save_memory'] } },
|
||||
},
|
||||
prompt: `When I say 'start server', you should run 'npm run dev'.`,
|
||||
assert: async (rig, result) => {
|
||||
const wasToolCalled = await rig.waitForToolCall('save_memory');
|
||||
expect(wasToolCalled, 'Expected save_memory tool to be called').toBe(
|
||||
true,
|
||||
);
|
||||
|
||||
assertModelHasOutput(result);
|
||||
checkModelOutputContent(result, {
|
||||
expectedContent: [/npm run dev|start server|ok|remember|will do/i],
|
||||
testName: `${TEST_PREFIX}${rememberingCommandAlias}`,
|
||||
});
|
||||
},
|
||||
});
|
||||
|
||||
const rememberingDbSchemaLocation =
|
||||
"Agent remembers project's database schema location";
|
||||
evalTest('ALWAYS_PASSES', {
|
||||
name: rememberingDbSchemaLocation,
|
||||
params: {
|
||||
settings: { tools: { core: ['save_memory'] } },
|
||||
},
|
||||
prompt: `The database schema for this project is located in \`db/schema.sql\`.`,
|
||||
assert: async (rig, result) => {
|
||||
const wasToolCalled = await rig.waitForToolCall('save_memory');
|
||||
expect(wasToolCalled, 'Expected save_memory tool to be called').toBe(
|
||||
true,
|
||||
);
|
||||
|
||||
assertModelHasOutput(result);
|
||||
checkModelOutputContent(result, {
|
||||
expectedContent: [/database schema|ok|remember|will do/i],
|
||||
testName: `${TEST_PREFIX}${rememberingDbSchemaLocation}`,
|
||||
});
|
||||
},
|
||||
});
|
||||
|
||||
const rememberingCodingStyle =
|
||||
"Agent remembers user's coding style preference";
|
||||
evalTest('ALWAYS_PASSES', {
|
||||
name: rememberingCodingStyle,
|
||||
params: {
|
||||
settings: { tools: { core: ['save_memory'] } },
|
||||
},
|
||||
prompt: `I prefer to use tabs instead of spaces for indentation.`,
|
||||
assert: async (rig, result) => {
|
||||
const wasToolCalled = await rig.waitForToolCall('save_memory');
|
||||
expect(wasToolCalled, 'Expected save_memory tool to be called').toBe(
|
||||
true,
|
||||
);
|
||||
|
||||
assertModelHasOutput(result);
|
||||
checkModelOutputContent(result, {
|
||||
expectedContent: [/tabs instead of spaces|ok|remember|will do/i],
|
||||
testName: `${TEST_PREFIX}${rememberingCodingStyle}`,
|
||||
});
|
||||
},
|
||||
});
|
||||
|
||||
const rememberingTestCommand =
|
||||
'Agent remembers specific project test command';
|
||||
evalTest('ALWAYS_PASSES', {
|
||||
name: rememberingTestCommand,
|
||||
params: {
|
||||
settings: { tools: { core: ['save_memory'] } },
|
||||
},
|
||||
prompt: `The command to run all backend tests is \`npm run test:backend\`.`,
|
||||
assert: async (rig, result) => {
|
||||
const wasToolCalled = await rig.waitForToolCall('save_memory');
|
||||
expect(wasToolCalled, 'Expected save_memory tool to be called').toBe(
|
||||
true,
|
||||
);
|
||||
|
||||
assertModelHasOutput(result);
|
||||
checkModelOutputContent(result, {
|
||||
expectedContent: [
|
||||
/command to run all backend tests|ok|remember|will do/i,
|
||||
],
|
||||
testName: `${TEST_PREFIX}${rememberingTestCommand}`,
|
||||
});
|
||||
},
|
||||
});
|
||||
|
||||
const rememberingMainEntryPoint =
|
||||
"Agent remembers project's main entry point";
|
||||
evalTest('ALWAYS_PASSES', {
|
||||
name: rememberingMainEntryPoint,
|
||||
params: {
|
||||
settings: { tools: { core: ['save_memory'] } },
|
||||
},
|
||||
prompt: `The main entry point for this project is \`src/index.js\`.`,
|
||||
assert: async (rig, result) => {
|
||||
const wasToolCalled = await rig.waitForToolCall('save_memory');
|
||||
expect(wasToolCalled, 'Expected save_memory tool to be called').toBe(
|
||||
true,
|
||||
);
|
||||
|
||||
assertModelHasOutput(result);
|
||||
checkModelOutputContent(result, {
|
||||
expectedContent: [
|
||||
/main entry point for this project|ok|remember|will do/i,
|
||||
],
|
||||
testName: `${TEST_PREFIX}${rememberingMainEntryPoint}`,
|
||||
});
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user