eval(save_memory): add multi-turn interactive evals for memoryManager (#23572)

This commit is contained in:
Sandy Tao
2026-03-23 15:58:55 -07:00
committed by GitHub
parent b35c12d8d0
commit f784e192fa
2 changed files with 189 additions and 1 deletions

View File

@@ -227,4 +227,136 @@ describe('save_memory', () => {
});
},
});
// Eval: a durable preference is stated once at the very start of a session,
// followed by unrelated debugging chatter. When the user later asks for their
// preferences to be saved, the agent is expected to call save_memory with it.
const proactiveMemoryFromLongSession =
  'Agent saves preference from earlier in conversation history';
evalTest('USUALLY_PASSES', {
  name: proactiveMemoryFromLongSession,
  params: {
    settings: {
      experimental: { memoryManager: true },
    },
  },
  // Pre-loaded conversation history, written as [speaker, text, timestamp]
  // rows and expanded into message objects with ids msg-1 .. msg-6.
  messages: [
    [
      'user',
      'By the way, I always prefer Vitest over Jest for testing in all my projects.',
      '2026-01-01T00:00:00Z',
    ],
    ['gemini', 'Noted! What are you working on today?', '2026-01-01T00:00:05Z'],
    [
      'user',
      "I'm debugging a failing API endpoint. The /users route returns a 500 error.",
      '2026-01-01T00:01:00Z',
    ],
    [
      'gemini',
      'It looks like the database connection might not be initialized before the query runs.',
      '2026-01-01T00:01:10Z',
    ],
    [
      'user',
      'Good catch — I fixed the import and the route works now.',
      '2026-01-01T00:02:00Z',
    ],
    [
      'gemini',
      'Great! Anything else you would like to work on?',
      '2026-01-01T00:02:05Z',
    ],
  ].map(([type, text, timestamp], index) => ({
    id: `msg-${index + 1}`,
    type,
    content: [{ text }],
    timestamp,
  })),
  prompt:
    'Please save any persistent preferences or facts about me from our conversation to memory.',
  assert: async (rig, result) => {
    // Only a save_memory call whose arguments mention Vitest counts.
    const vitestPattern = /vitest/i;
    const savedVitestPreference = await rig.waitForToolCall(
      'save_memory',
      undefined,
      (args) => vitestPattern.test(args),
    );
    expect(
      savedVitestPreference,
      'Expected save_memory to be called with the Vitest preference from the conversation history',
    ).toBe(true);
    assertModelHasOutput(result);
  },
});
// Eval: the history contains one global preference (dark mode everywhere) and
// one project-scoped preference (2-space indentation). When asked to save the
// preferences, the agent is expected to pass both of them to save_memory.
const memoryManagerRoutingPreferences =
  'Agent routes global and project preferences to memory';
evalTest('USUALLY_PASSES', {
  name: memoryManagerRoutingPreferences,
  params: {
    settings: {
      experimental: { memoryManager: true },
    },
  },
  messages: [
    {
      id: 'msg-1',
      type: 'user',
      content: [
        {
          text: 'I always use dark mode in all my editors and terminals.',
        },
      ],
      timestamp: '2026-01-01T00:00:00Z',
    },
    {
      id: 'msg-2',
      type: 'gemini',
      content: [{ text: 'Got it, I will keep that in mind!' }],
      timestamp: '2026-01-01T00:00:05Z',
    },
    {
      id: 'msg-3',
      type: 'user',
      content: [
        {
          text: 'For this project specifically, we use 2-space indentation.',
        },
      ],
      timestamp: '2026-01-01T00:01:00Z',
    },
    {
      id: 'msg-4',
      type: 'gemini',
      content: [
        { text: 'Understood, 2-space indentation for this project.' },
      ],
      timestamp: '2026-01-01T00:01:05Z',
    },
  ],
  prompt: 'Please save the preferences I mentioned earlier to memory.',
  assert: async (rig, result) => {
    // Check the saved content, not merely that the tool ran: a bare
    // "was save_memory called" check would also pass if the agent stored
    // something unrelated. Each preference is matched independently, so the
    // agent may save them in a single call or in separate calls.
    const savedGlobalPreference = await rig.waitForToolCall(
      'save_memory',
      undefined,
      (args) => /dark/i.test(args),
    );
    expect(
      savedGlobalPreference,
      'Expected save_memory to be called with the dark mode preference',
    ).toBe(true);

    const savedProjectPreference = await rig.waitForToolCall(
      'save_memory',
      undefined,
      (args) => /indent|2[- ]?space|two[- ]?space/i.test(args),
    );
    expect(
      savedProjectPreference,
      'Expected save_memory to be called with the 2-space indentation preference',
    ).toBe(true);

    assertModelHasOutput(result);
  },
});
});

View File

@@ -13,6 +13,9 @@ import { TestRig } from '@google/gemini-cli-test-utils';
import {
createUnauthorizedToolError,
parseAgentMarkdown,
Storage,
getProjectHash,
SESSION_FILE_PREFIX,
} from '@google/gemini-cli-core';
export * from '@google/gemini-cli-test-utils';
@@ -117,8 +120,57 @@ export function evalTest(policy: EvalPolicy, evalCase: EvalCase) {
execSync('git commit --allow-empty -m "Initial commit"', execOptions);
}
// If messages are provided, write a session file so --resume can load it.
// The file is written to <project temp dir>/chats as JSON containing the
// session ID, project hash, timestamps and the supplied messages.
let sessionId: string | undefined;
if (evalCase.messages) {
  // `||` rather than `??`: an empty-string sessionId also falls back to a
  // generated ID.
  sessionId =
    evalCase.sessionId ||
    `test-session-${crypto.randomUUID().slice(0, 8)}`;
  // Temporarily set GEMINI_CLI_HOME so Storage writes to the same
  // directory the CLI subprocess will use (rig.homeDir).
  const originalGeminiHome = process.env['GEMINI_CLI_HOME'];
  process.env['GEMINI_CLI_HOME'] = rig.homeDir!;
  try {
    // Resolve the project root once so Storage and the project hash are
    // guaranteed to be derived from the same path.
    const projectRoot = fs.realpathSync(rig.testDir!);
    const storage = new Storage(projectRoot);
    await storage.initialize();
    const chatsDir = path.join(storage.getProjectTempDir(), 'chats');
    fs.mkdirSync(chatsDir, { recursive: true });
    // Take a single instant so startTime, lastUpdated and the filename
    // timestamp cannot disagree with each other.
    const nowIso = new Date().toISOString();
    const conversation = {
      sessionId,
      projectHash: getProjectHash(projectRoot),
      startTime: nowIso,
      lastUpdated: nowIso,
      messages: evalCase.messages,
    };
    // "YYYY-MM-DDTHH:MM" with the colon replaced by a dash.
    const timestamp = nowIso.slice(0, 16).replace(/:/g, '-');
    // Note: for auto-generated IDs ("test-session-xxxxxxxx") the first eight
    // characters are always "test-ses", so the random part is not in the name.
    const filename = `${SESSION_FILE_PREFIX}${timestamp}-${sessionId.slice(0, 8)}.json`;
    fs.writeFileSync(
      path.join(chatsDir, filename),
      JSON.stringify(conversation, null, 2),
    );
  } catch (e) {
    // Storage initialization may fail in some environments; log and continue.
    // NOTE(review): sessionId stays set after a failure, so the run below
    // still passes --resume for a session file that was never written.
    console.warn('Failed to write session history:', e);
  } finally {
    // Restore original GEMINI_CLI_HOME.
    if (originalGeminiHome === undefined) {
      delete process.env['GEMINI_CLI_HOME'];
    } else {
      process.env['GEMINI_CLI_HOME'] = originalGeminiHome;
    }
  }
}
const result = await rig.run({
args: evalCase.prompt,
args: sessionId
? ['--resume', sessionId, evalCase.prompt]
: evalCase.prompt,
approvalMode: evalCase.approvalMode ?? 'yolo',
timeout: evalCase.timeout,
env: {
@@ -219,6 +271,10 @@ export interface EvalCase {
prompt: string;
timeout?: number;
files?: Record<string, string>;
/** Conversation history to pre-load via --resume. Each entry is a message object with type, content, etc. */
messages?: Record<string, unknown>[];
/** Session ID for the resumed session. Auto-generated if not provided. */
sessionId?: string;
approvalMode?: 'default' | 'auto_edit' | 'yolo' | 'plan';
assert: (rig: TestRig, result: string) => Promise<void>;
}