fix(evals): update eval tests for invoke_agent telemetry and project-scoped memory (#25502)

2026-05-12 12:54:07 -07:00 · 2026-04-15 16:14:12 -07:00
parent e827cfdf83
commit 00b7781c3c
3 changed files with 88 additions and 37 deletions
@@ -17,9 +17,17 @@ describe('CliHelpAgent Delegation', () => {
    timeout: 60000,
    assert: async (rig, _result) => {
      const toolLogs = rig.readToolLogs();
-      const toolCallIndex = toolLogs.findIndex(
+      const toolCallIndex = toolLogs.findIndex((log) => {
-        (log) => log.toolRequest.name === 'cli_help',
+        if (log.toolRequest.name === 'invoke_agent') {
-      );
+          try {
            const args = JSON.parse(log.toolRequest.args);
            return args.agent_name === 'cli_help';
          } catch {
            return false;
          }
        }
        return false;
      });
      expect(toolCallIndex).toBeGreaterThan(-1);
      expect(toolCallIndex).toBeLessThan(5); // Called within first 5 turns
    },
@@ -26,11 +26,22 @@ describe('generalist_agent', () => {
    prompt:
      'Please use the generalist agent to create a file called "generalist_test_file.txt" containing exactly the following text: success',
    assert: async (rig) => {
-      // 1) Verify the generalist agent was invoked
+      // 1) Verify the generalist agent was invoked via invoke_agent
-      const foundToolCall = await rig.waitForToolCall('generalist');
+      const foundToolCall = await rig.waitForToolCall(
        'invoke_agent',
        undefined,
        (args) => {
          try {
            const parsed = JSON.parse(args);
            return parsed.agent_name === 'generalist';
          } catch {
            return false;
          }
        },
      );
      expect(
        foundToolCall,
-        'Expected to find a tool call for generalist agent',
+        'Expected to find an invoke_agent tool call for generalist agent',
      ).toBeTruthy();
      // 2) Verify the file was created as expected
@@ -145,22 +145,30 @@ describe('save_memory', () => {
    },
  });
-  const ignoringDbSchemaLocation =
+  const savingDbSchemaLocationAsProjectMemory =
-    "Agent ignores workspace's database schema location";
+    'Agent saves workspace database schema location as project memory';
  evalTest('USUALLY_PASSES', {
    suiteName: 'default',
    suiteType: 'behavioral',
-    name: ignoringDbSchemaLocation,
+    name: savingDbSchemaLocationAsProjectMemory,
    prompt: `The database schema for this workspace is located in \`db/schema.sql\`.`,
    assert: async (rig, result) => {
-      await rig.waitForTelemetryReady();
+      const wasToolCalled = await rig.waitForToolCall(
-      const wasToolCalled = rig
+        'save_memory',
-        .readToolLogs()
+        undefined,
-        .some((log) => log.toolRequest.name === 'save_memory');
+        (args) => {
          try {
            const params = JSON.parse(args);
            return params.scope === 'project';
          } catch {
            return false;
          }
        },
      );
      expect(
        wasToolCalled,
-        'save_memory should not be called for workspace-specific information',
+        'Expected save_memory to be called with scope="project" for workspace-specific information',
-      ).toBe(false);
+      ).toBe(true);
      assertModelHasOutput(result);
    },
@@ -188,42 +196,59 @@ describe('save_memory', () => {
    },
  });
-  const ignoringBuildArtifactLocation =
+  const savingBuildArtifactLocationAsProjectMemory =
-    'Agent ignores workspace build artifact location';
+    'Agent saves workspace build artifact location as project memory';
  evalTest('USUALLY_PASSES', {
    suiteName: 'default',
    suiteType: 'behavioral',
-    name: ignoringBuildArtifactLocation,
+    name: savingBuildArtifactLocationAsProjectMemory,
    prompt: `In this workspace, build artifacts are stored in the \`dist/artifacts\` directory.`,
    assert: async (rig, result) => {
-      await rig.waitForTelemetryReady();
+      const wasToolCalled = await rig.waitForToolCall(
-      const wasToolCalled = rig
+        'save_memory',
-        .readToolLogs()
+        undefined,
-        .some((log) => log.toolRequest.name === 'save_memory');
+        (args) => {
          try {
            const params = JSON.parse(args);
            return params.scope === 'project';
          } catch {
            return false;
          }
        },
      );
      expect(
        wasToolCalled,
-        'save_memory should not be called for workspace-specific information',
+        'Expected save_memory to be called with scope="project" for workspace-specific information',
-      ).toBe(false);
+      ).toBe(true);
      assertModelHasOutput(result);
    },
  });
-  const ignoringMainEntryPoint = "Agent ignores workspace's main entry point";
+  const savingMainEntryPointAsProjectMemory =
    'Agent saves workspace main entry point as project memory';
  evalTest('USUALLY_PASSES', {
    suiteName: 'default',
    suiteType: 'behavioral',
-    name: ignoringMainEntryPoint,
+    name: savingMainEntryPointAsProjectMemory,
    prompt: `The main entry point for this workspace is \`src/index.js\`.`,
    assert: async (rig, result) => {
-      await rig.waitForTelemetryReady();
+      const wasToolCalled = await rig.waitForToolCall(
-      const wasToolCalled = rig
+        'save_memory',
-        .readToolLogs()
+        undefined,
-        .some((log) => log.toolRequest.name === 'save_memory');
+        (args) => {
          try {
            const params = JSON.parse(args);
            return params.scope === 'project';
          } catch {
            return false;
          }
        },
      );
      expect(
        wasToolCalled,
-        'save_memory should not be called for workspace-specific information',
+        'Expected save_memory to be called with scope="project" for workspace-specific information',
-      ).toBe(false);
+      ).toBe(true);
      assertModelHasOutput(result);
    },
@@ -317,13 +342,13 @@ describe('save_memory', () => {
      'Please save any persistent preferences or facts about me from our conversation to memory.',
    assert: async (rig, result) => {
      const wasToolCalled = await rig.waitForToolCall(
-        'save_memory',
+        'invoke_agent',
        undefined,
-        (args) => /vitest/i.test(args),
+        (args) => /save_memory/i.test(args) && /vitest/i.test(args),
      );
      expect(
        wasToolCalled,
-        'Expected save_memory to be called with the Vitest preference from the conversation history',
+        'Expected invoke_agent to be called with save_memory agent and the Vitest preference from the conversation history',
      ).toBe(true);
      assertModelHasOutput(result);
@@ -379,8 +404,15 @@ describe('save_memory', () => {
    ],
    prompt: 'Please save the preferences I mentioned earlier to memory.',
    assert: async (rig, result) => {
-      const wasToolCalled = await rig.waitForToolCall('save_memory');
+      const wasToolCalled = await rig.waitForToolCall(
-      expect(wasToolCalled, 'Expected save_memory to be called').toBe(true);
+        'invoke_agent',
        undefined,
        (args) => /save_memory/i.test(args),
      );
      expect(
        wasToolCalled,
        'Expected invoke_agent to be called with save_memory agent',
      ).toBe(true);
      assertModelHasOutput(result);
    },