fix(evals): update eval tests for invoke_agent telemetry and project-scoped memory (#25502)

This commit is contained in:
Sandy Tao
2026-04-15 16:14:12 -07:00
committed by GitHub
parent e827cfdf83
commit 00b7781c3c
3 changed files with 88 additions and 37 deletions

View File

@@ -17,9 +17,17 @@ describe('CliHelpAgent Delegation', () => {
timeout: 60000,
assert: async (rig, _result) => {
const toolLogs = rig.readToolLogs();
const toolCallIndex = toolLogs.findIndex(
(log) => log.toolRequest.name === 'cli_help',
);
const toolCallIndex = toolLogs.findIndex((log) => {
if (log.toolRequest.name === 'invoke_agent') {
try {
const args = JSON.parse(log.toolRequest.args);
return args.agent_name === 'cli_help';
} catch {
return false;
}
}
return false;
});
expect(toolCallIndex).toBeGreaterThan(-1);
expect(toolCallIndex).toBeLessThan(5); // Called within first 5 turns
},

View File

@@ -26,11 +26,22 @@ describe('generalist_agent', () => {
prompt:
'Please use the generalist agent to create a file called "generalist_test_file.txt" containing exactly the following text: success',
assert: async (rig) => {
// 1) Verify the generalist agent was invoked
const foundToolCall = await rig.waitForToolCall('generalist');
// 1) Verify the generalist agent was invoked via invoke_agent
const foundToolCall = await rig.waitForToolCall(
'invoke_agent',
undefined,
(args) => {
try {
const parsed = JSON.parse(args);
return parsed.agent_name === 'generalist';
} catch {
return false;
}
},
);
expect(
foundToolCall,
'Expected to find a tool call for generalist agent',
'Expected to find an invoke_agent tool call for generalist agent',
).toBeTruthy();
// 2) Verify the file was created as expected

View File

@@ -145,22 +145,30 @@ describe('save_memory', () => {
},
});
const ignoringDbSchemaLocation =
"Agent ignores workspace's database schema location";
const savingDbSchemaLocationAsProjectMemory =
'Agent saves workspace database schema location as project memory';
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: ignoringDbSchemaLocation,
name: savingDbSchemaLocationAsProjectMemory,
prompt: `The database schema for this workspace is located in \`db/schema.sql\`.`,
assert: async (rig, result) => {
await rig.waitForTelemetryReady();
const wasToolCalled = rig
.readToolLogs()
.some((log) => log.toolRequest.name === 'save_memory');
const wasToolCalled = await rig.waitForToolCall(
'save_memory',
undefined,
(args) => {
try {
const params = JSON.parse(args);
return params.scope === 'project';
} catch {
return false;
}
},
);
expect(
wasToolCalled,
'save_memory should not be called for workspace-specific information',
).toBe(false);
'Expected save_memory to be called with scope="project" for workspace-specific information',
).toBe(true);
assertModelHasOutput(result);
},
@@ -188,42 +196,59 @@ describe('save_memory', () => {
},
});
const ignoringBuildArtifactLocation =
'Agent ignores workspace build artifact location';
const savingBuildArtifactLocationAsProjectMemory =
'Agent saves workspace build artifact location as project memory';
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: ignoringBuildArtifactLocation,
name: savingBuildArtifactLocationAsProjectMemory,
prompt: `In this workspace, build artifacts are stored in the \`dist/artifacts\` directory.`,
assert: async (rig, result) => {
await rig.waitForTelemetryReady();
const wasToolCalled = rig
.readToolLogs()
.some((log) => log.toolRequest.name === 'save_memory');
const wasToolCalled = await rig.waitForToolCall(
'save_memory',
undefined,
(args) => {
try {
const params = JSON.parse(args);
return params.scope === 'project';
} catch {
return false;
}
},
);
expect(
wasToolCalled,
'save_memory should not be called for workspace-specific information',
).toBe(false);
'Expected save_memory to be called with scope="project" for workspace-specific information',
).toBe(true);
assertModelHasOutput(result);
},
});
const ignoringMainEntryPoint = "Agent ignores workspace's main entry point";
const savingMainEntryPointAsProjectMemory =
'Agent saves workspace main entry point as project memory';
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: ignoringMainEntryPoint,
name: savingMainEntryPointAsProjectMemory,
prompt: `The main entry point for this workspace is \`src/index.js\`.`,
assert: async (rig, result) => {
await rig.waitForTelemetryReady();
const wasToolCalled = rig
.readToolLogs()
.some((log) => log.toolRequest.name === 'save_memory');
const wasToolCalled = await rig.waitForToolCall(
'save_memory',
undefined,
(args) => {
try {
const params = JSON.parse(args);
return params.scope === 'project';
} catch {
return false;
}
},
);
expect(
wasToolCalled,
'save_memory should not be called for workspace-specific information',
).toBe(false);
'Expected save_memory to be called with scope="project" for workspace-specific information',
).toBe(true);
assertModelHasOutput(result);
},
@@ -317,13 +342,13 @@ describe('save_memory', () => {
'Please save any persistent preferences or facts about me from our conversation to memory.',
assert: async (rig, result) => {
const wasToolCalled = await rig.waitForToolCall(
'save_memory',
'invoke_agent',
undefined,
(args) => /vitest/i.test(args),
(args) => /save_memory/i.test(args) && /vitest/i.test(args),
);
expect(
wasToolCalled,
'Expected save_memory to be called with the Vitest preference from the conversation history',
'Expected invoke_agent to be called with save_memory agent and the Vitest preference from the conversation history',
).toBe(true);
assertModelHasOutput(result);
@@ -379,8 +404,15 @@ describe('save_memory', () => {
],
prompt: 'Please save the preferences I mentioned earlier to memory.',
assert: async (rig, result) => {
const wasToolCalled = await rig.waitForToolCall('save_memory');
expect(wasToolCalled, 'Expected save_memory to be called').toBe(true);
const wasToolCalled = await rig.waitForToolCall(
'invoke_agent',
undefined,
(args) => /save_memory/i.test(args),
);
expect(
wasToolCalled,
'Expected invoke_agent to be called with save_memory agent',
).toBe(true);
assertModelHasOutput(result);
},