mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-04-17 00:31:44 -07:00
fix(evals): update eval tests for invoke_agent telemetry and project-scoped memory (#25502)
This commit is contained in:
@@ -17,9 +17,17 @@ describe('CliHelpAgent Delegation', () => {
|
||||
timeout: 60000,
|
||||
assert: async (rig, _result) => {
|
||||
const toolLogs = rig.readToolLogs();
|
||||
const toolCallIndex = toolLogs.findIndex(
|
||||
(log) => log.toolRequest.name === 'cli_help',
|
||||
);
|
||||
const toolCallIndex = toolLogs.findIndex((log) => {
|
||||
if (log.toolRequest.name === 'invoke_agent') {
|
||||
try {
|
||||
const args = JSON.parse(log.toolRequest.args);
|
||||
return args.agent_name === 'cli_help';
|
||||
} catch {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
});
|
||||
expect(toolCallIndex).toBeGreaterThan(-1);
|
||||
expect(toolCallIndex).toBeLessThan(5); // Called within first 5 turns
|
||||
},
|
||||
|
||||
@@ -26,11 +26,22 @@ describe('generalist_agent', () => {
|
||||
prompt:
|
||||
'Please use the generalist agent to create a file called "generalist_test_file.txt" containing exactly the following text: success',
|
||||
assert: async (rig) => {
|
||||
// 1) Verify the generalist agent was invoked
|
||||
const foundToolCall = await rig.waitForToolCall('generalist');
|
||||
// 1) Verify the generalist agent was invoked via invoke_agent
|
||||
const foundToolCall = await rig.waitForToolCall(
|
||||
'invoke_agent',
|
||||
undefined,
|
||||
(args) => {
|
||||
try {
|
||||
const parsed = JSON.parse(args);
|
||||
return parsed.agent_name === 'generalist';
|
||||
} catch {
|
||||
return false;
|
||||
}
|
||||
},
|
||||
);
|
||||
expect(
|
||||
foundToolCall,
|
||||
'Expected to find a tool call for generalist agent',
|
||||
'Expected to find an invoke_agent tool call for generalist agent',
|
||||
).toBeTruthy();
|
||||
|
||||
// 2) Verify the file was created as expected
|
||||
|
||||
@@ -145,22 +145,30 @@ describe('save_memory', () => {
|
||||
},
|
||||
});
|
||||
|
||||
const ignoringDbSchemaLocation =
|
||||
"Agent ignores workspace's database schema location";
|
||||
const savingDbSchemaLocationAsProjectMemory =
|
||||
'Agent saves workspace database schema location as project memory';
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: ignoringDbSchemaLocation,
|
||||
name: savingDbSchemaLocationAsProjectMemory,
|
||||
prompt: `The database schema for this workspace is located in \`db/schema.sql\`.`,
|
||||
assert: async (rig, result) => {
|
||||
await rig.waitForTelemetryReady();
|
||||
const wasToolCalled = rig
|
||||
.readToolLogs()
|
||||
.some((log) => log.toolRequest.name === 'save_memory');
|
||||
const wasToolCalled = await rig.waitForToolCall(
|
||||
'save_memory',
|
||||
undefined,
|
||||
(args) => {
|
||||
try {
|
||||
const params = JSON.parse(args);
|
||||
return params.scope === 'project';
|
||||
} catch {
|
||||
return false;
|
||||
}
|
||||
},
|
||||
);
|
||||
expect(
|
||||
wasToolCalled,
|
||||
'save_memory should not be called for workspace-specific information',
|
||||
).toBe(false);
|
||||
'Expected save_memory to be called with scope="project" for workspace-specific information',
|
||||
).toBe(true);
|
||||
|
||||
assertModelHasOutput(result);
|
||||
},
|
||||
@@ -188,42 +196,59 @@ describe('save_memory', () => {
|
||||
},
|
||||
});
|
||||
|
||||
const ignoringBuildArtifactLocation =
|
||||
'Agent ignores workspace build artifact location';
|
||||
const savingBuildArtifactLocationAsProjectMemory =
|
||||
'Agent saves workspace build artifact location as project memory';
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: ignoringBuildArtifactLocation,
|
||||
name: savingBuildArtifactLocationAsProjectMemory,
|
||||
prompt: `In this workspace, build artifacts are stored in the \`dist/artifacts\` directory.`,
|
||||
assert: async (rig, result) => {
|
||||
await rig.waitForTelemetryReady();
|
||||
const wasToolCalled = rig
|
||||
.readToolLogs()
|
||||
.some((log) => log.toolRequest.name === 'save_memory');
|
||||
const wasToolCalled = await rig.waitForToolCall(
|
||||
'save_memory',
|
||||
undefined,
|
||||
(args) => {
|
||||
try {
|
||||
const params = JSON.parse(args);
|
||||
return params.scope === 'project';
|
||||
} catch {
|
||||
return false;
|
||||
}
|
||||
},
|
||||
);
|
||||
expect(
|
||||
wasToolCalled,
|
||||
'save_memory should not be called for workspace-specific information',
|
||||
).toBe(false);
|
||||
'Expected save_memory to be called with scope="project" for workspace-specific information',
|
||||
).toBe(true);
|
||||
|
||||
assertModelHasOutput(result);
|
||||
},
|
||||
});
|
||||
|
||||
const ignoringMainEntryPoint = "Agent ignores workspace's main entry point";
|
||||
const savingMainEntryPointAsProjectMemory =
|
||||
'Agent saves workspace main entry point as project memory';
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: ignoringMainEntryPoint,
|
||||
name: savingMainEntryPointAsProjectMemory,
|
||||
prompt: `The main entry point for this workspace is \`src/index.js\`.`,
|
||||
assert: async (rig, result) => {
|
||||
await rig.waitForTelemetryReady();
|
||||
const wasToolCalled = rig
|
||||
.readToolLogs()
|
||||
.some((log) => log.toolRequest.name === 'save_memory');
|
||||
const wasToolCalled = await rig.waitForToolCall(
|
||||
'save_memory',
|
||||
undefined,
|
||||
(args) => {
|
||||
try {
|
||||
const params = JSON.parse(args);
|
||||
return params.scope === 'project';
|
||||
} catch {
|
||||
return false;
|
||||
}
|
||||
},
|
||||
);
|
||||
expect(
|
||||
wasToolCalled,
|
||||
'save_memory should not be called for workspace-specific information',
|
||||
).toBe(false);
|
||||
'Expected save_memory to be called with scope="project" for workspace-specific information',
|
||||
).toBe(true);
|
||||
|
||||
assertModelHasOutput(result);
|
||||
},
|
||||
@@ -317,13 +342,13 @@ describe('save_memory', () => {
|
||||
'Please save any persistent preferences or facts about me from our conversation to memory.',
|
||||
assert: async (rig, result) => {
|
||||
const wasToolCalled = await rig.waitForToolCall(
|
||||
'save_memory',
|
||||
'invoke_agent',
|
||||
undefined,
|
||||
(args) => /vitest/i.test(args),
|
||||
(args) => /save_memory/i.test(args) && /vitest/i.test(args),
|
||||
);
|
||||
expect(
|
||||
wasToolCalled,
|
||||
'Expected save_memory to be called with the Vitest preference from the conversation history',
|
||||
'Expected invoke_agent to be called with save_memory agent and the Vitest preference from the conversation history',
|
||||
).toBe(true);
|
||||
|
||||
assertModelHasOutput(result);
|
||||
@@ -379,8 +404,15 @@ describe('save_memory', () => {
|
||||
],
|
||||
prompt: 'Please save the preferences I mentioned earlier to memory.',
|
||||
assert: async (rig, result) => {
|
||||
const wasToolCalled = await rig.waitForToolCall('save_memory');
|
||||
expect(wasToolCalled, 'Expected save_memory to be called').toBe(true);
|
||||
const wasToolCalled = await rig.waitForToolCall(
|
||||
'invoke_agent',
|
||||
undefined,
|
||||
(args) => /save_memory/i.test(args),
|
||||
);
|
||||
expect(
|
||||
wasToolCalled,
|
||||
'Expected invoke_agent to be called with save_memory agent',
|
||||
).toBe(true);
|
||||
|
||||
assertModelHasOutput(result);
|
||||
},
|
||||
|
||||
Reference in New Issue
Block a user