mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-03-25 05:21:03 -07:00
eval(save_memory): add multi-turn interactive evals for memoryManager (#23572)
This commit is contained in:
@@ -227,4 +227,136 @@ describe('save_memory', () => {
|
||||
});
|
||||
},
|
||||
});
|
||||
|
||||
// Eval case: a lasting preference (Vitest over Jest) is stated in the first
// turn of a pre-seeded conversation, followed by unrelated debugging turns.
// The final prompt asks the agent to persist preferences, and the assertion
// requires a save_memory call whose arguments mention Vitest.
const proactiveMemoryFromLongSession =
  'Agent saves preference from earlier in conversation history';
evalTest('USUALLY_PASSES', {
  name: proactiveMemoryFromLongSession,
  params: {
    // Run with the experimental memoryManager setting switched on.
    settings: { experimental: { memoryManager: true } },
  },
  // Conversation history supplied to the eval before the prompt is sent.
  messages: [
    {
      id: 'msg-1',
      type: 'user',
      content: [
        {
          text: 'By the way, I always prefer Vitest over Jest for testing in all my projects.',
        },
      ],
      timestamp: '2026-01-01T00:00:00Z',
    },
    {
      id: 'msg-2',
      type: 'gemini',
      content: [{ text: 'Noted! What are you working on today?' }],
      timestamp: '2026-01-01T00:00:05Z',
    },
    {
      id: 'msg-3',
      type: 'user',
      content: [
        {
          text: "I'm debugging a failing API endpoint. The /users route returns a 500 error.",
        },
      ],
      timestamp: '2026-01-01T00:01:00Z',
    },
    {
      id: 'msg-4',
      type: 'gemini',
      content: [
        {
          text: 'It looks like the database connection might not be initialized before the query runs.',
        },
      ],
      timestamp: '2026-01-01T00:01:10Z',
    },
    {
      id: 'msg-5',
      type: 'user',
      content: [
        { text: 'Good catch — I fixed the import and the route works now.' },
      ],
      timestamp: '2026-01-01T00:02:00Z',
    },
    {
      id: 'msg-6',
      type: 'gemini',
      content: [{ text: 'Great! Anything else you would like to work on?' }],
      timestamp: '2026-01-01T00:02:05Z',
    },
  ],
  prompt:
    'Please save any persistent preferences or facts about me from our conversation to memory.',
  assert: async (rig, result) => {
    // Only a save_memory call whose arguments match /vitest/i counts.
    const sawVitestSave = await rig.waitForToolCall(
      'save_memory',
      undefined,
      (toolArgs) => /vitest/i.test(toolArgs),
    );
    expect(
      sawVitestSave,
      'Expected save_memory to be called with the Vitest preference from the conversation history',
    ).toBe(true);

    assertModelHasOutput(result);
  },
});
|
||||
|
||||
// Eval case: the pre-seeded conversation contains one preference phrased as
// applying everywhere (dark mode) and one phrased as project-specific
// (2-space indentation). The final prompt asks the agent to save them; the
// assertion only checks that save_memory was called at all.
const memoryManagerRoutingPreferences =
  'Agent routes global and project preferences to memory';
evalTest('USUALLY_PASSES', {
  name: memoryManagerRoutingPreferences,
  params: {
    // Run with the experimental memoryManager setting switched on.
    settings: { experimental: { memoryManager: true } },
  },
  // Conversation history supplied to the eval before the prompt is sent.
  messages: [
    {
      id: 'msg-1',
      type: 'user',
      content: [
        {
          text: 'I always use dark mode in all my editors and terminals.',
        },
      ],
      timestamp: '2026-01-01T00:00:00Z',
    },
    {
      id: 'msg-2',
      type: 'gemini',
      content: [{ text: 'Got it, I will keep that in mind!' }],
      timestamp: '2026-01-01T00:00:05Z',
    },
    {
      id: 'msg-3',
      type: 'user',
      content: [
        {
          text: 'For this project specifically, we use 2-space indentation.',
        },
      ],
      timestamp: '2026-01-01T00:01:00Z',
    },
    {
      id: 'msg-4',
      type: 'gemini',
      content: [
        { text: 'Understood, 2-space indentation for this project.' },
      ],
      timestamp: '2026-01-01T00:01:05Z',
    },
  ],
  prompt: 'Please save the preferences I mentioned earlier to memory.',
  assert: async (rig, result) => {
    // Any save_memory invocation satisfies this check; arguments are not inspected.
    const sawSaveMemory = await rig.waitForToolCall('save_memory');
    expect(sawSaveMemory, 'Expected save_memory to be called').toBe(true);

    assertModelHasOutput(result);
  },
});
|
||||
});
|
||||
|
||||
@@ -13,6 +13,9 @@ import { TestRig } from '@google/gemini-cli-test-utils';
|
||||
import {
|
||||
createUnauthorizedToolError,
|
||||
parseAgentMarkdown,
|
||||
Storage,
|
||||
getProjectHash,
|
||||
SESSION_FILE_PREFIX,
|
||||
} from '@google/gemini-cli-core';
|
||||
|
||||
export * from '@google/gemini-cli-test-utils';
|
||||
@@ -117,8 +120,57 @@ export function evalTest(policy: EvalPolicy, evalCase: EvalCase) {
|
||||
execSync('git commit --allow-empty -m "Initial commit"', execOptions);
|
||||
}
|
||||
|
||||
// If messages are provided, write a session file so --resume can load it.
|
||||
let sessionId: string | undefined;
|
||||
if (evalCase.messages) {
|
||||
sessionId =
|
||||
evalCase.sessionId ||
|
||||
`test-session-${crypto.randomUUID().slice(0, 8)}`;
|
||||
|
||||
// Temporarily set GEMINI_CLI_HOME so Storage writes to the same
|
||||
// directory the CLI subprocess will use (rig.homeDir).
|
||||
const originalGeminiHome = process.env['GEMINI_CLI_HOME'];
|
||||
process.env['GEMINI_CLI_HOME'] = rig.homeDir!;
|
||||
try {
|
||||
const storage = new Storage(fs.realpathSync(rig.testDir!));
|
||||
await storage.initialize();
|
||||
const chatsDir = path.join(storage.getProjectTempDir(), 'chats');
|
||||
fs.mkdirSync(chatsDir, { recursive: true });
|
||||
|
||||
const conversation = {
|
||||
sessionId,
|
||||
projectHash: getProjectHash(fs.realpathSync(rig.testDir!)),
|
||||
startTime: new Date().toISOString(),
|
||||
lastUpdated: new Date().toISOString(),
|
||||
messages: evalCase.messages,
|
||||
};
|
||||
|
||||
const timestamp = new Date()
|
||||
.toISOString()
|
||||
.slice(0, 16)
|
||||
.replace(/:/g, '-');
|
||||
const filename = `${SESSION_FILE_PREFIX}${timestamp}-${sessionId.slice(0, 8)}.json`;
|
||||
fs.writeFileSync(
|
||||
path.join(chatsDir, filename),
|
||||
JSON.stringify(conversation, null, 2),
|
||||
);
|
||||
} catch (e) {
|
||||
// Storage initialization may fail in some environments; log and continue.
|
||||
console.warn('Failed to write session history:', e);
|
||||
} finally {
|
||||
// Restore original GEMINI_CLI_HOME.
|
||||
if (originalGeminiHome === undefined) {
|
||||
delete process.env['GEMINI_CLI_HOME'];
|
||||
} else {
|
||||
process.env['GEMINI_CLI_HOME'] = originalGeminiHome;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const result = await rig.run({
|
||||
args: evalCase.prompt,
|
||||
args: sessionId
|
||||
? ['--resume', sessionId, evalCase.prompt]
|
||||
: evalCase.prompt,
|
||||
approvalMode: evalCase.approvalMode ?? 'yolo',
|
||||
timeout: evalCase.timeout,
|
||||
env: {
|
||||
@@ -219,6 +271,10 @@ export interface EvalCase {
|
||||
prompt: string;
|
||||
timeout?: number;
|
||||
files?: Record<string, string>;
|
||||
/** Conversation history to pre-load via --resume. Each entry is a message object with type, content, etc. */
|
||||
messages?: Record<string, unknown>[];
|
||||
/** Session ID for the resumed session. Auto-generated if not provided. */
|
||||
sessionId?: string;
|
||||
approvalMode?: 'default' | 'auto_edit' | 'yolo' | 'plan';
|
||||
assert: (rig: TestRig, result: string) => Promise<void>;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user