fix(evals): update eval tests for invoke_agent telemetry and project-scoped memory (#25502)

This commit is contained in:
Sandy Tao
2026-04-15 16:14:12 -07:00
committed by GitHub
parent e827cfdf83
commit 00b7781c3c
3 changed files with 88 additions and 37 deletions
+11 -3
View File
@@ -17,9 +17,17 @@ describe('CliHelpAgent Delegation', () => {
timeout: 60000, timeout: 60000,
assert: async (rig, _result) => { assert: async (rig, _result) => {
const toolLogs = rig.readToolLogs(); const toolLogs = rig.readToolLogs();
const toolCallIndex = toolLogs.findIndex( const toolCallIndex = toolLogs.findIndex((log) => {
(log) => log.toolRequest.name === 'cli_help', if (log.toolRequest.name === 'invoke_agent') {
); try {
const args = JSON.parse(log.toolRequest.args);
return args.agent_name === 'cli_help';
} catch {
return false;
}
}
return false;
});
expect(toolCallIndex).toBeGreaterThan(-1); expect(toolCallIndex).toBeGreaterThan(-1);
expect(toolCallIndex).toBeLessThan(5); // Called within first 5 turns expect(toolCallIndex).toBeLessThan(5); // Called within first 5 turns
}, },
+14 -3
View File
@@ -26,11 +26,22 @@ describe('generalist_agent', () => {
prompt: prompt:
'Please use the generalist agent to create a file called "generalist_test_file.txt" containing exactly the following text: success', 'Please use the generalist agent to create a file called "generalist_test_file.txt" containing exactly the following text: success',
assert: async (rig) => { assert: async (rig) => {
// 1) Verify the generalist agent was invoked // 1) Verify the generalist agent was invoked via invoke_agent
const foundToolCall = await rig.waitForToolCall('generalist'); const foundToolCall = await rig.waitForToolCall(
'invoke_agent',
undefined,
(args) => {
try {
const parsed = JSON.parse(args);
return parsed.agent_name === 'generalist';
} catch {
return false;
}
},
);
expect( expect(
foundToolCall, foundToolCall,
'Expected to find a tool call for generalist agent', 'Expected to find an invoke_agent tool call for generalist agent',
).toBeTruthy(); ).toBeTruthy();
// 2) Verify the file was created as expected // 2) Verify the file was created as expected
+63 -31
View File
@@ -145,22 +145,30 @@ describe('save_memory', () => {
}, },
}); });
const ignoringDbSchemaLocation = const savingDbSchemaLocationAsProjectMemory =
"Agent ignores workspace's database schema location"; 'Agent saves workspace database schema location as project memory';
evalTest('USUALLY_PASSES', { evalTest('USUALLY_PASSES', {
suiteName: 'default', suiteName: 'default',
suiteType: 'behavioral', suiteType: 'behavioral',
name: ignoringDbSchemaLocation, name: savingDbSchemaLocationAsProjectMemory,
prompt: `The database schema for this workspace is located in \`db/schema.sql\`.`, prompt: `The database schema for this workspace is located in \`db/schema.sql\`.`,
assert: async (rig, result) => { assert: async (rig, result) => {
await rig.waitForTelemetryReady(); const wasToolCalled = await rig.waitForToolCall(
const wasToolCalled = rig 'save_memory',
.readToolLogs() undefined,
.some((log) => log.toolRequest.name === 'save_memory'); (args) => {
try {
const params = JSON.parse(args);
return params.scope === 'project';
} catch {
return false;
}
},
);
expect( expect(
wasToolCalled, wasToolCalled,
'save_memory should not be called for workspace-specific information', 'Expected save_memory to be called with scope="project" for workspace-specific information',
).toBe(false); ).toBe(true);
assertModelHasOutput(result); assertModelHasOutput(result);
}, },
@@ -188,42 +196,59 @@ describe('save_memory', () => {
}, },
}); });
const ignoringBuildArtifactLocation = const savingBuildArtifactLocationAsProjectMemory =
'Agent ignores workspace build artifact location'; 'Agent saves workspace build artifact location as project memory';
evalTest('USUALLY_PASSES', { evalTest('USUALLY_PASSES', {
suiteName: 'default', suiteName: 'default',
suiteType: 'behavioral', suiteType: 'behavioral',
name: ignoringBuildArtifactLocation, name: savingBuildArtifactLocationAsProjectMemory,
prompt: `In this workspace, build artifacts are stored in the \`dist/artifacts\` directory.`, prompt: `In this workspace, build artifacts are stored in the \`dist/artifacts\` directory.`,
assert: async (rig, result) => { assert: async (rig, result) => {
await rig.waitForTelemetryReady(); const wasToolCalled = await rig.waitForToolCall(
const wasToolCalled = rig 'save_memory',
.readToolLogs() undefined,
.some((log) => log.toolRequest.name === 'save_memory'); (args) => {
try {
const params = JSON.parse(args);
return params.scope === 'project';
} catch {
return false;
}
},
);
expect( expect(
wasToolCalled, wasToolCalled,
'save_memory should not be called for workspace-specific information', 'Expected save_memory to be called with scope="project" for workspace-specific information',
).toBe(false); ).toBe(true);
assertModelHasOutput(result); assertModelHasOutput(result);
}, },
}); });
const ignoringMainEntryPoint = "Agent ignores workspace's main entry point"; const savingMainEntryPointAsProjectMemory =
'Agent saves workspace main entry point as project memory';
evalTest('USUALLY_PASSES', { evalTest('USUALLY_PASSES', {
suiteName: 'default', suiteName: 'default',
suiteType: 'behavioral', suiteType: 'behavioral',
name: ignoringMainEntryPoint, name: savingMainEntryPointAsProjectMemory,
prompt: `The main entry point for this workspace is \`src/index.js\`.`, prompt: `The main entry point for this workspace is \`src/index.js\`.`,
assert: async (rig, result) => { assert: async (rig, result) => {
await rig.waitForTelemetryReady(); const wasToolCalled = await rig.waitForToolCall(
const wasToolCalled = rig 'save_memory',
.readToolLogs() undefined,
.some((log) => log.toolRequest.name === 'save_memory'); (args) => {
try {
const params = JSON.parse(args);
return params.scope === 'project';
} catch {
return false;
}
},
);
expect( expect(
wasToolCalled, wasToolCalled,
'save_memory should not be called for workspace-specific information', 'Expected save_memory to be called with scope="project" for workspace-specific information',
).toBe(false); ).toBe(true);
assertModelHasOutput(result); assertModelHasOutput(result);
}, },
@@ -317,13 +342,13 @@ describe('save_memory', () => {
'Please save any persistent preferences or facts about me from our conversation to memory.', 'Please save any persistent preferences or facts about me from our conversation to memory.',
assert: async (rig, result) => { assert: async (rig, result) => {
const wasToolCalled = await rig.waitForToolCall( const wasToolCalled = await rig.waitForToolCall(
'save_memory', 'invoke_agent',
undefined, undefined,
(args) => /vitest/i.test(args), (args) => /save_memory/i.test(args) && /vitest/i.test(args),
); );
expect( expect(
wasToolCalled, wasToolCalled,
'Expected save_memory to be called with the Vitest preference from the conversation history', 'Expected invoke_agent to be called with save_memory agent and the Vitest preference from the conversation history',
).toBe(true); ).toBe(true);
assertModelHasOutput(result); assertModelHasOutput(result);
@@ -379,8 +404,15 @@ describe('save_memory', () => {
], ],
prompt: 'Please save the preferences I mentioned earlier to memory.', prompt: 'Please save the preferences I mentioned earlier to memory.',
assert: async (rig, result) => { assert: async (rig, result) => {
const wasToolCalled = await rig.waitForToolCall('save_memory'); const wasToolCalled = await rig.waitForToolCall(
expect(wasToolCalled, 'Expected save_memory to be called').toBe(true); 'invoke_agent',
undefined,
(args) => /save_memory/i.test(args),
);
expect(
wasToolCalled,
'Expected invoke_agent to be called with save_memory agent',
).toBe(true);
assertModelHasOutput(result); assertModelHasOutput(result);
}, },