eval(save_memory): add multi-turn interactive evals for memoryManager (#23572)

2026-05-12 12:54:07 -07:00 · 2026-03-23 15:58:55 -07:00
parent b35c12d8d0
commit f784e192fa
2 changed files with 189 additions and 1 deletion
@@ -227,4 +227,136 @@ describe('save_memory', () => {
      });
    },
  });
+
+  // Multi-turn eval: the Vitest preference appears in the first user turn and
+  // is followed by an unrelated debugging exchange. The final prompt then asks
+  // the agent to save what it knows about the user.
+  const proactiveMemoryFromLongSession =
+    'Agent saves preference from earlier in conversation history';
+  evalTest('USUALLY_PASSES', {
+    name: proactiveMemoryFromLongSession,
+    params: {
+      settings: {
+        experimental: { memoryManager: true },
+      },
+    },
+    messages: [
+      {
+        id: 'msg-1',
+        type: 'user',
+        content: [
+          {
+            text: 'By the way, I always prefer Vitest over Jest for testing in all my projects.',
+          },
+        ],
+        timestamp: '2026-01-01T00:00:00Z',
+      },
+      {
+        id: 'msg-2',
+        type: 'gemini',
+        content: [{ text: 'Noted! What are you working on today?' }],
+        timestamp: '2026-01-01T00:00:05Z',
+      },
+      {
+        id: 'msg-3',
+        type: 'user',
+        content: [
+          {
+            text: "I'm debugging a failing API endpoint. The /users route returns a 500 error.",
+          },
+        ],
+        timestamp: '2026-01-01T00:01:00Z',
+      },
+      {
+        id: 'msg-4',
+        type: 'gemini',
+        content: [
+          {
+            text: 'It looks like the database connection might not be initialized before the query runs.',
+          },
+        ],
+        timestamp: '2026-01-01T00:01:10Z',
+      },
+      {
+        id: 'msg-5',
+        type: 'user',
+        content: [
+          { text: 'Good catch — I fixed the import and the route works now.' },
+        ],
+        timestamp: '2026-01-01T00:02:00Z',
+      },
+      {
+        id: 'msg-6',
+        type: 'gemini',
+        content: [{ text: 'Great! Anything else you would like to work on?' }],
+        timestamp: '2026-01-01T00:02:05Z',
+      },
+    ],
+    prompt:
+      'Please save any persistent preferences or facts about me from our conversation to memory.',
+    assert: async (rig, result) => {
+      // Only a save_memory call whose arguments mention Vitest counts.
+      const vitestPattern = /vitest/i;
+      const savedVitestPreference = await rig.waitForToolCall(
+        'save_memory',
+        undefined,
+        (toolArgs) => vitestPattern.test(toolArgs),
+      );
+      expect(
+        savedVitestPreference,
+        'Expected save_memory to be called with the Vitest preference from the conversation history',
+      ).toBe(true);
+
+      assertModelHasOutput(result);
+    },
+  });
+
+  // Multi-turn eval: the history holds one preference stated as global
+  // ("in all my editors and terminals") and one scoped to this project. The
+  // final prompt asks the agent to save the preferences mentioned earlier.
+  const memoryManagerRoutingPreferences =
+    'Agent routes global and project preferences to memory';
+  evalTest('USUALLY_PASSES', {
+    name: memoryManagerRoutingPreferences,
+    params: {
+      settings: {
+        experimental: { memoryManager: true },
+      },
+    },
+    messages: [
+      {
+        id: 'msg-1',
+        type: 'user',
+        content: [
+          {
+            text: 'I always use dark mode in all my editors and terminals.',
+          },
+        ],
+        timestamp: '2026-01-01T00:00:00Z',
+      },
+      {
+        id: 'msg-2',
+        type: 'gemini',
+        content: [{ text: 'Got it, I will keep that in mind!' }],
+        timestamp: '2026-01-01T00:00:05Z',
+      },
+      {
+        id: 'msg-3',
+        type: 'user',
+        content: [
+          {
+            text: 'For this project specifically, we use 2-space indentation.',
+          },
+        ],
+        timestamp: '2026-01-01T00:01:00Z',
+      },
+      {
+        id: 'msg-4',
+        type: 'gemini',
+        content: [
+          { text: 'Understood, 2-space indentation for this project.' },
+        ],
+        timestamp: '2026-01-01T00:01:05Z',
+      },
+    ],
+    prompt: 'Please save the preferences I mentioned earlier to memory.',
+    assert: async (rig, result) => {
+      // Require the saved arguments to mention one of the stated preferences,
+      // so a save_memory call with unrelated content does not pass. Where each
+      // preference ends up (global vs. project) is not checked here.
+      const wasToolCalled = await rig.waitForToolCall(
+        'save_memory',
+        undefined,
+        (args) => /dark|indent|2[- ]spaces?/i.test(args),
+      );
+      expect(
+        wasToolCalled,
+        'Expected save_memory to be called with a preference from the conversation history',
+      ).toBe(true);
+
+      assertModelHasOutput(result);
+    },
+  });
 });
@@ -13,6 +13,9 @@ import { TestRig } from '@google/gemini-cli-test-utils';
 import {
  createUnauthorizedToolError,
  parseAgentMarkdown,
+  Storage,
+  getProjectHash,
+  SESSION_FILE_PREFIX,
 } from '@google/gemini-cli-core';

 export * from '@google/gemini-cli-test-utils';
@@ -117,8 +120,57 @@ export function evalTest(policy: EvalPolicy, evalCase: EvalCase) {
        execSync('git commit --allow-empty -m "Initial commit"', execOptions);
      }

+      // If messages are provided, write a session file so --resume can load it.
+      // sessionId is left undefined when there is no history, and is reset to
+      // undefined when the session file cannot be written, so later code never
+      // refers to a session that does not exist on disk.
+      let sessionId: string | undefined;
+      if (evalCase.messages) {
+        sessionId =
+          evalCase.sessionId ||
+          `test-session-${crypto.randomUUID().slice(0, 8)}`;
+
+        // Temporarily set GEMINI_CLI_HOME so Storage writes to the same
+        // directory the CLI subprocess will use (rig.homeDir).
+        const originalGeminiHome = process.env['GEMINI_CLI_HOME'];
+        process.env['GEMINI_CLI_HOME'] = rig.homeDir!;
+        try {
+          // Resolve the test directory once so Storage and the project hash
+          // are derived from the same path.
+          const projectRoot = fs.realpathSync(rig.testDir!);
+          const storage = new Storage(projectRoot);
+          await storage.initialize();
+          const chatsDir = path.join(storage.getProjectTempDir(), 'chats');
+          fs.mkdirSync(chatsDir, { recursive: true });
+
+          // Take a single timestamp so startTime, lastUpdated and the
+          // filename stamp cannot disagree.
+          const now = new Date().toISOString();
+          const conversation = {
+            sessionId,
+            projectHash: getProjectHash(projectRoot),
+            startTime: now,
+            lastUpdated: now,
+            messages: evalCase.messages,
+          };
+
+          // Minute-resolution stamp (YYYY-MM-DDTHH-mm) for the filename.
+          const timestamp = now.slice(0, 16).replace(/:/g, '-');
+          const filename = `${SESSION_FILE_PREFIX}${timestamp}-${sessionId.slice(0, 8)}.json`;
+          fs.writeFileSync(
+            path.join(chatsDir, filename),
+            JSON.stringify(conversation, null, 2),
+          );
+        } catch (e) {
+          // Storage initialization may fail in some environments; log and
+          // continue. Clear sessionId because no session file was written.
+          console.warn('Failed to write session history:', e);
+          sessionId = undefined;
+        } finally {
+          // Restore original GEMINI_CLI_HOME.
+          if (originalGeminiHome === undefined) {
+            delete process.env['GEMINI_CLI_HOME'];
+          } else {
+            process.env['GEMINI_CLI_HOME'] = originalGeminiHome;
+          }
+        }
+      }
+
      const result = await rig.run({
-        args: evalCase.prompt,
+        args: sessionId
+          ? ['--resume', sessionId, evalCase.prompt]
+          : evalCase.prompt,
        approvalMode: evalCase.approvalMode ?? 'yolo',
        timeout: evalCase.timeout,
        env: {
@@ -219,6 +271,10 @@ export interface EvalCase {
  prompt: string;
  timeout?: number;
  files?: Record<string, string>;
+  /** Conversation history to pre-load via --resume. Each entry is a message object with type, content, etc. */
+  messages?: Record<string, unknown>[];
+  /** Session ID for the resumed session. Auto-generated if not provided. */
+  sessionId?: string;
  approvalMode?: 'default' | 'auto_edit' | 'yolo' | 'plan';
  assert: (rig: TestRig, result: string) => Promise<void>;
 }