Generalize evals infra to support more types of evals, organization and queuing of named suites (#24941)

This commit is contained in:
Christian Gunderman
2026-04-08 23:57:26 +00:00
committed by GitHub
parent bc3ed61adb
commit f1bb2af6de
32 changed files with 475 additions and 133 deletions
+12
View File
@@ -19,6 +19,8 @@ describe('Answer vs. ask eval', () => {
* automatically modify the file, but instead asks for permission.
*/
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should not edit files when asked to inspect for bugs',
prompt: 'Inspect app.ts for bugs',
files: FILES,
@@ -42,6 +44,8 @@ describe('Answer vs. ask eval', () => {
* does modify the file.
*/
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should edit files when asked to fix bug',
prompt: 'Fix the bug in app.ts - it should add numbers not subtract',
files: FILES,
@@ -66,6 +70,8 @@ describe('Answer vs. ask eval', () => {
* automatically modify the file, but instead asks for permission.
*/
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should not edit when asking "any bugs"',
prompt: 'Any bugs in app.ts?',
files: FILES,
@@ -89,6 +95,8 @@ describe('Answer vs. ask eval', () => {
* automatically modify the file.
*/
evalTest('ALWAYS_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should not edit files when asked a general question',
prompt: 'How does app.ts work?',
files: FILES,
@@ -112,6 +120,8 @@ describe('Answer vs. ask eval', () => {
* automatically modify the file.
*/
evalTest('ALWAYS_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should not edit files when asked about style',
prompt: 'Is app.ts following good style?',
files: FILES,
@@ -135,6 +145,8 @@ describe('Answer vs. ask eval', () => {
* the agent does NOT automatically modify the file.
*/
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should not edit files when user notes an issue',
prompt: 'The add function subtracts numbers.',
files: FILES,
+49 -49
View File
@@ -10,10 +10,13 @@ import {
runEval,
prepareLogDir,
symlinkNodeModules,
withEvalRetries,
prepareWorkspace,
type BaseEvalCase,
EVAL_MODEL,
} from './test-helper.js';
import fs from 'node:fs';
import path from 'node:path';
import { DEFAULT_GEMINI_MODEL } from '@google/gemini-cli-core';
/**
* Config overrides for evals, with tool-restriction fields explicitly
@@ -29,15 +32,13 @@ interface EvalConfigOverrides {
allowedTools?: never;
/** Restricting tools via mainAgentTools in evals is forbidden. */
mainAgentTools?: never;
[key: string]: unknown;
}
export interface AppEvalCase {
name: string;
export interface AppEvalCase extends BaseEvalCase {
configOverrides?: EvalConfigOverrides;
prompt: string;
timeout?: number;
files?: Record<string, string>;
setup?: (rig: AppRig) => Promise<void>;
assert: (rig: AppRig, output: string) => Promise<void>;
}
@@ -48,56 +49,55 @@ export interface AppEvalCase {
*/
export function appEvalTest(policy: EvalPolicy, evalCase: AppEvalCase) {
const fn = async () => {
const rig = new AppRig({
configOverrides: {
model: DEFAULT_GEMINI_MODEL,
...evalCase.configOverrides,
},
});
await withEvalRetries(evalCase.name, async () => {
const rig = new AppRig({
configOverrides: {
model: EVAL_MODEL,
...evalCase.configOverrides,
},
});
const { logDir, sanitizedName } = await prepareLogDir(evalCase.name);
const logFile = path.join(logDir, `${sanitizedName}.log`);
const { logDir, sanitizedName } = await prepareLogDir(evalCase.name);
const logFile = path.join(logDir, `${sanitizedName}.log`);
try {
await rig.initialize();
try {
await rig.initialize();
const testDir = rig.getTestDir();
symlinkNodeModules(testDir);
const testDir = rig.getTestDir();
symlinkNodeModules(testDir);
// Setup initial files
if (evalCase.files) {
for (const [filePath, content] of Object.entries(evalCase.files)) {
const fullPath = path.join(testDir, filePath);
fs.mkdirSync(path.dirname(fullPath), { recursive: true });
fs.writeFileSync(fullPath, content);
// Setup initial files
if (evalCase.files) {
// Note: AppRig does not use a separate homeDir, so we use testDir twice
await prepareWorkspace(testDir, testDir, evalCase.files);
}
// Run custom setup if provided (e.g. for breakpoints)
if (evalCase.setup) {
await evalCase.setup(rig);
}
// Render the app!
await rig.render();
// Wait for initial ready state
await rig.waitForIdle();
// Send the initial prompt
await rig.sendMessage(evalCase.prompt);
// Run assertion. Interaction-heavy tests can do their own waiting/steering here.
const output = rig.getStaticOutput();
await evalCase.assert(rig, output);
} finally {
const output = rig.getStaticOutput();
if (output) {
await fs.promises.writeFile(logFile, output);
}
await rig.unmount();
}
// Run custom setup if provided (e.g. for breakpoints)
if (evalCase.setup) {
await evalCase.setup(rig);
}
// Render the app!
await rig.render();
// Wait for initial ready state
await rig.waitForIdle();
// Send the initial prompt
await rig.sendMessage(evalCase.prompt);
// Run assertion. Interaction-heavy tests can do their own waiting/steering here.
const output = rig.getStaticOutput();
await evalCase.assert(rig, output);
} finally {
const output = rig.getStaticOutput();
if (output) {
await fs.promises.writeFile(logFile, output);
}
await rig.unmount();
}
});
};
runEval(policy, evalCase.name, fn, (evalCase.timeout ?? 60000) + 10000);
runEval(policy, evalCase, fn, (evalCase.timeout ?? 60000) + 10000);
}
+18 -6
View File
@@ -5,17 +5,21 @@
*/
import { describe, expect } from 'vitest';
import { appEvalTest, AppEvalCase } from './app-test-helper.js';
import { EvalPolicy } from './test-helper.js';
import { ApprovalMode, isRecord } from '@google/gemini-cli-core';
import { appEvalTest, type AppEvalCase } from './app-test-helper.js';
import { type EvalPolicy } from './test-helper.js';
function askUserEvalTest(policy: EvalPolicy, evalCase: AppEvalCase) {
const existingGeneral = evalCase.configOverrides?.['general'];
const generalBase = isRecord(existingGeneral) ? existingGeneral : {};
return appEvalTest(policy, {
...evalCase,
configOverrides: {
...evalCase.configOverrides,
approvalMode: ApprovalMode.DEFAULT,
general: {
...evalCase.configOverrides?.general,
approvalMode: 'default',
...generalBase,
enableAutoUpdate: false,
enableAutoUpdateNotification: false,
},
@@ -28,6 +32,8 @@ function askUserEvalTest(policy: EvalPolicy, evalCase: AppEvalCase) {
describe('ask_user', () => {
askUserEvalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'Agent uses AskUser tool to present multiple choice options',
prompt: `Use the ask_user tool to ask me what my favorite color is. Provide 3 options: red, green, or blue.`,
setup: async (rig) => {
@@ -43,6 +49,8 @@ describe('ask_user', () => {
});
askUserEvalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'Agent uses AskUser tool to clarify ambiguous requirements',
files: {
'package.json': JSON.stringify({ name: 'my-app', version: '1.0.0' }),
@@ -61,6 +69,8 @@ describe('ask_user', () => {
});
askUserEvalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'Agent uses AskUser tool before performing significant ambiguous rework',
files: {
'packages/core/src/index.ts': '// index\nexport const version = "1.0.0";',
@@ -82,8 +92,8 @@ describe('ask_user', () => {
]);
expect(confirmation, 'Expected a tool call confirmation').toBeDefined();
if (confirmation?.name === 'enter_plan_mode') {
rig.acceptConfirmation('enter_plan_mode');
if (confirmation?.toolName === 'enter_plan_mode') {
await rig.resolveTool('enter_plan_mode');
confirmation = await rig.waitForPendingConfirmation('ask_user');
}
@@ -101,6 +111,8 @@ describe('ask_user', () => {
// updates to clarify that shell command confirmation is handled by the UI.
// See fix: https://github.com/google-gemini/gemini-cli/pull/20504
askUserEvalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'Agent does NOT use AskUser to confirm shell commands',
files: {
'package.json': JSON.stringify({
+4
View File
@@ -14,6 +14,8 @@ describe('Automated tool use', () => {
* a repro by guiding the agent into using the existing deficient script.
*/
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should use automated tools (eslint --fix) to fix code style issues',
files: {
'package.json': JSON.stringify(
@@ -102,6 +104,8 @@ describe('Automated tool use', () => {
* instead of trying to edit the files itself.
*/
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should use automated tools (prettier --write) to fix formatting issues',
files: {
'package.json': JSON.stringify(
+2
View File
@@ -3,6 +3,8 @@ import { evalTest } from './test-helper.js';
describe('CliHelpAgent Delegation', () => {
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should delegate to cli_help agent for subagent creation questions',
params: {
settings: {
+136
View File
@@ -0,0 +1,136 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import {
type EvalPolicy,
runEval,
prepareLogDir,
withEvalRetries,
prepareWorkspace,
type BaseEvalCase,
} from './test-helper.js';
import fs from 'node:fs';
import path from 'node:path';
import os from 'node:os';
import { randomUUID } from 'node:crypto';
import {
Config,
type ConfigParameters,
AuthType,
ApprovalMode,
createPolicyEngineConfig,
ExtensionLoader,
IntegrityDataStatus,
makeFakeConfig,
type GeminiCLIExtension,
} from '@google/gemini-cli-core';
import { createMockSettings } from '../packages/cli/src/test-utils/settings.js';
/**
 * A minimal mock ExtensionLoader used by component evals to bypass
 * extension integrity checks.
 *
 * Reports no installed extensions and stubs the integrity manager so
 * Config initialization never blocks on real integrity verification.
 */
class MockExtensionManager extends ExtensionLoader {
  /** Component evals run with no extensions loaded. */
  override getExtensions(): GeminiCLIExtension[] {
    return [];
  }
  // Consent/setting request hooks are no-ops; there is no UI in these tests.
  setRequestConsent = (): void => {};
  setRequestSetting = (): void => {};
  // Always reports VERIFIED and stores nothing, so integrity checks pass.
  integrityManager = {
    verifyExtensionIntegrity: async (): Promise<IntegrityDataStatus> =>
      IntegrityDataStatus.VERIFIED,
    storeExtensionIntegrity: async (): Promise<void> => undefined,
  };
}
/**
 * A single component-level eval case.
 *
 * Extends BaseEvalCase (name, suite metadata, files, timeout) with hooks
 * that receive the initialized backend Config directly — no UI involved.
 */
export interface ComponentEvalCase extends BaseEvalCase {
  /** Overrides spread into the ConfigParameters built by ComponentRig. */
  configOverrides?: Partial<ConfigParameters>;
  /** Optional setup hook, run after the Config is initialized. */
  setup?: (config: Config) => Promise<void>;
  /** Assertion hook; the eval passes iff this resolves without throwing. */
  assert: (config: Config) => Promise<void>;
}
/**
 * Test rig that stands up a real backend Config inside a throwaway
 * temp directory, bypassing the UI entirely. Used by componentEvalTest.
 */
export class ComponentRig {
  public config: Config | undefined;
  public testDir: string;
  public sessionId: string;

  constructor(
    private options: { configOverrides?: Partial<ConfigParameters> } = {},
  ) {
    const id = randomUUID();
    // Unique workspace per rig instance so parallel evals cannot collide.
    const dirPrefix = path.join(
      os.tmpdir(),
      `gemini-component-rig-${id.slice(0, 8)}-`,
    );
    this.testDir = fs.mkdtempSync(dirPrefix);
    this.sessionId = `test-session-${id}`;
  }

  /**
   * Builds and initializes the Config, then refreshes auth so the real
   * BaseLlmClient is available to the eval.
   */
  async initialize() {
    const settings = createMockSettings();
    const policyEngineConfig = await createPolicyEngineConfig(
      settings.merged,
      ApprovalMode.DEFAULT,
    );
    this.config = makeFakeConfig({
      sessionId: this.sessionId,
      targetDir: this.testDir,
      cwd: this.testDir,
      debugMode: false,
      model: 'test-model',
      interactive: false,
      approvalMode: ApprovalMode.DEFAULT,
      policyEngineConfig,
      // Scheduler is unnecessary when driving components directly.
      enableEventDrivenScheduler: false,
      extensionLoader: new MockExtensionManager(),
      useAlternateBuffer: false,
      // Caller-supplied overrides win over every default above.
      ...this.options.configOverrides,
    });
    await this.config.initialize();
    // Refresh auth using USE_GEMINI to initialize the real BaseLlmClient.
    await this.config.refreshAuth(AuthType.USE_GEMINI);
  }

  /** Deletes the temp workspace created by the constructor. */
  async cleanup() {
    fs.rmSync(this.testDir, { recursive: true, force: true });
  }
}
/**
 * Registers a behavioral eval that runs directly against backend
 * components. It provides a fully initialized Config with real API
 * access, bypassing the UI.
 */
export function componentEvalTest(
  policy: EvalPolicy,
  evalCase: ComponentEvalCase,
) {
  // One attempt of the eval: build a rig, run setup + assert, tear down.
  const runAttempt = async () => {
    const rig = new ComponentRig({ configOverrides: evalCase.configOverrides });
    await prepareLogDir(evalCase.name);
    try {
      await rig.initialize();
      // Materialize fixture files in the workspace. The rig has no
      // separate home dir, so testDir serves as both roots.
      if (evalCase.files) {
        await prepareWorkspace(rig.testDir, rig.testDir, evalCase.files);
      }
      if (evalCase.setup) {
        await evalCase.setup(rig.config!);
      }
      await evalCase.assert(rig.config!);
    } finally {
      // Always reclaim the temp workspace, even when the eval fails.
      await rig.cleanup();
    }
  };
  const run = () => withEvalRetries(evalCase.name, runAttempt);
  // Extra headroom beyond the per-case timeout for setup and teardown.
  runEval(policy, evalCase, run, (evalCase.timeout ?? 60000) + 10000);
}
+2
View File
@@ -20,6 +20,8 @@ You are the mutation agent. Do the mutation requested.
describe('concurrency safety eval test cases', () => {
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'mutation agents are run in parallel when explicitly requested',
params: {
settings: {
+2
View File
@@ -13,6 +13,8 @@ describe('Edits location eval', () => {
* instead of creating a new one.
*/
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should update existing test file instead of creating a new one',
files: {
'package.json': JSON.stringify(
+6
View File
@@ -15,6 +15,8 @@ describe('Frugal reads eval', () => {
* nearby ranges into a single contiguous read to save tool calls.
*/
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should use ranged read when nearby lines are targeted',
files: {
'package.json': JSON.stringify({
@@ -135,6 +137,8 @@ describe('Frugal reads eval', () => {
* apart to avoid the need to read the whole file.
*/
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should use ranged read when targets are far apart',
files: {
'package.json': JSON.stringify({
@@ -204,6 +208,8 @@ describe('Frugal reads eval', () => {
* (e.g.: 10), as it's more efficient than many small ranged reads.
*/
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should read the entire file when there are many matches',
files: {
'package.json': JSON.stringify({
+2 -12
View File
@@ -13,18 +13,6 @@ import { evalTest } from './test-helper.js';
* This ensures the agent doesn't flood the context window with unnecessary search results.
*/
describe('Frugal Search', () => {
const getGrepParams = (call: any): any => {
let args = call.toolRequest.args;
if (typeof args === 'string') {
try {
args = JSON.parse(args);
} catch (e) {
// Ignore parse errors
}
}
return args;
};
/**
* Ensure that the agent makes use of either grep or ranged reads in fulfilling this task.
* The task is specifically phrased to not evoke "view" or "search" specifically because
@@ -33,6 +21,8 @@ describe('Frugal Search', () => {
* ranged reads.
*/
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should use grep or ranged read for large files',
prompt: 'What year was legacy_processor.ts written?',
files: {
+2
View File
@@ -11,6 +11,8 @@ import fs from 'node:fs/promises';
describe('generalist_agent', () => {
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should be able to use generalist agent by explicitly asking the main agent to invoke it',
params: {
settings: {
+8
View File
@@ -11,6 +11,8 @@ describe('generalist_delegation', () => {
// --- Positive Evals (Should Delegate) ---
appEvalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should delegate batch error fixing to generalist agent',
configOverrides: {
agents: {
@@ -54,6 +56,8 @@ describe('generalist_delegation', () => {
});
appEvalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should autonomously delegate complex batch task to generalist agent',
configOverrides: {
agents: {
@@ -94,6 +98,8 @@ describe('generalist_delegation', () => {
// --- Negative Evals (Should NOT Delegate - Assertive Handling) ---
appEvalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should NOT delegate simple read and fix to generalist agent',
configOverrides: {
agents: {
@@ -128,6 +134,8 @@ describe('generalist_delegation', () => {
});
appEvalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should NOT delegate simple direct question to generalist agent',
configOverrides: {
agents: {
+4
View File
@@ -26,6 +26,8 @@ describe('git repo eval', () => {
* be more consistent.
*/
evalTest('ALWAYS_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should not git add commit changes unprompted',
prompt:
'Finish this up for me by just making a targeted fix for the bug in index.ts. Do not build, install anything, or add tests',
@@ -55,6 +57,8 @@ describe('git repo eval', () => {
* instructed to not do so by default.
*/
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should git commit changes when prompted',
prompt:
'Make a targeted fix for the bug in index.ts without building, installing anything, or adding tests. Then, commit your changes.',
+12
View File
@@ -15,6 +15,8 @@ describe('grep_search_functionality', () => {
const TEST_PREFIX = 'Grep Search Functionality: ';
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should find a simple string in a file',
files: {
'test.txt': `hello
@@ -33,6 +35,8 @@ describe('grep_search_functionality', () => {
});
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should perform a case-sensitive search',
files: {
'test.txt': `Hello
@@ -63,6 +67,8 @@ describe('grep_search_functionality', () => {
});
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should return only file names when names_only is used',
files: {
'file1.txt': 'match me',
@@ -93,6 +99,8 @@ describe('grep_search_functionality', () => {
});
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should search only within the specified include_pattern glob',
files: {
'file.js': 'my_function();',
@@ -123,6 +131,8 @@ describe('grep_search_functionality', () => {
});
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should search within a specific subdirectory',
files: {
'src/main.js': 'unique_string_1',
@@ -153,6 +163,8 @@ describe('grep_search_functionality', () => {
});
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should report no matches correctly',
files: {
'file.txt': 'nothing to see here',
+7 -2
View File
@@ -5,13 +5,14 @@
*/
import { describe, expect } from 'vitest';
import { evalTest } from './test-helper.js';
import { assertModelHasOutput } from '../integration-tests/test-helper.js';
import { evalTest, assertModelHasOutput } from './test-helper.js';
describe('Hierarchical Memory', () => {
const conflictResolutionTest =
'Agent follows hierarchy for contradictory instructions';
evalTest('ALWAYS_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: conflictResolutionTest,
params: {
settings: {
@@ -48,6 +49,8 @@ What is my favorite fruit? Tell me just the name of the fruit.`,
const provenanceAwarenessTest = 'Agent is aware of memory provenance';
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: provenanceAwarenessTest,
params: {
settings: {
@@ -87,6 +90,8 @@ Provide the answer as an XML block like this:
const extensionVsGlobalTest = 'Extension memory wins over Global memory';
evalTest('ALWAYS_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: extensionVsGlobalTest,
params: {
settings: {
+4
View File
@@ -8,6 +8,8 @@ describe('interactive_commands', () => {
* intervention.
*/
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should not use interactive commands',
prompt: 'Execute tests.',
files: {
@@ -49,6 +51,8 @@ describe('interactive_commands', () => {
* Validates that the agent uses non-interactive flags when scaffolding a new project.
*/
evalTest('ALWAYS_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should use non-interactive flags when scaffolding a new app',
prompt: 'Create a new react application named my-app using vite.',
assert: async (rig, result) => {
+4 -2
View File
@@ -5,14 +5,14 @@
*/
import { describe, expect } from 'vitest';
import { act } from 'react';
import path from 'node:path';
import fs from 'node:fs';
import { appEvalTest } from './app-test-helper.js';
import { PolicyDecision } from '@google/gemini-cli-core';
describe('Model Steering Behavioral Evals', () => {
appEvalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'Corrective Hint: Model switches task based on hint during tool turn',
configOverrides: {
modelSteering: true,
@@ -52,6 +52,8 @@ describe('Model Steering Behavioral Evals', () => {
});
appEvalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'Suggestive Hint: Model incorporates user guidance mid-stream',
configOverrides: {
modelSteering: true,
+12
View File
@@ -33,6 +33,8 @@ describe('plan_mode', () => {
.filter(Boolean);
evalTest('ALWAYS_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should refuse file modification when in plan mode',
approvalMode: ApprovalMode.PLAN,
params: {
@@ -68,6 +70,8 @@ describe('plan_mode', () => {
});
evalTest('ALWAYS_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should refuse saving new documentation to the repo when in plan mode',
approvalMode: ApprovalMode.PLAN,
params: {
@@ -105,6 +109,8 @@ describe('plan_mode', () => {
});
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should enter plan mode when asked to create a plan',
approvalMode: ApprovalMode.DEFAULT,
params: {
@@ -122,6 +128,8 @@ describe('plan_mode', () => {
});
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should exit plan mode when plan is complete and implementation is requested',
approvalMode: ApprovalMode.PLAN,
params: {
@@ -169,6 +177,8 @@ describe('plan_mode', () => {
});
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should allow file modification in plans directory when in plan mode',
approvalMode: ApprovalMode.PLAN,
params: {
@@ -201,6 +211,8 @@ describe('plan_mode', () => {
});
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should create a plan in plan mode and implement it for a refactoring task',
params: {
settings,
+2
View File
@@ -11,6 +11,8 @@ import fs from 'node:fs/promises';
describe('redundant_casts', () => {
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should not add redundant or unsafe casts when modifying typescript code',
files: {
'src/cast_example.ts': `
+2
View File
@@ -3,6 +3,8 @@ import { evalTest } from './test-helper.js';
describe('Sandbox recovery', () => {
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'attempts to use additional_permissions when operation not permitted',
prompt:
'Run ./script.sh. It will fail with "Operation not permitted". When it does, you must retry running it by passing the appropriate additional_permissions.',
+28 -2
View File
@@ -5,16 +5,18 @@
*/
import { describe, expect } from 'vitest';
import { evalTest } from './test-helper.js';
import {
evalTest,
assertModelHasOutput,
checkModelOutputContent,
} from '../integration-tests/test-helper.js';
} from './test-helper.js';
describe('save_memory', () => {
const TEST_PREFIX = 'Save memory test: ';
const rememberingFavoriteColor = "Agent remembers user's favorite color";
evalTest('ALWAYS_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: rememberingFavoriteColor,
prompt: `remember that my favorite color is blue.
@@ -35,6 +37,8 @@ describe('save_memory', () => {
});
const rememberingCommandRestrictions = 'Agent remembers command restrictions';
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: rememberingCommandRestrictions,
prompt: `I don't want you to ever run npm commands.`,
@@ -54,6 +58,8 @@ describe('save_memory', () => {
const rememberingWorkflow = 'Agent remembers workflow preferences';
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: rememberingWorkflow,
prompt: `I want you to always lint after building.`,
@@ -74,6 +80,8 @@ describe('save_memory', () => {
const ignoringTemporaryInformation =
'Agent ignores temporary conversation details';
evalTest('ALWAYS_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: ignoringTemporaryInformation,
prompt: `I'm going to get a coffee.`,
@@ -97,6 +105,8 @@ describe('save_memory', () => {
const rememberingPetName = "Agent remembers user's pet's name";
evalTest('ALWAYS_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: rememberingPetName,
prompt: `Please remember that my dog's name is Buddy.`,
@@ -116,6 +126,8 @@ describe('save_memory', () => {
const rememberingCommandAlias = 'Agent remembers custom command aliases';
evalTest('ALWAYS_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: rememberingCommandAlias,
prompt: `When I say 'start server', you should run 'npm run dev'.`,
@@ -136,6 +148,8 @@ describe('save_memory', () => {
const ignoringDbSchemaLocation =
"Agent ignores workspace's database schema location";
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: ignoringDbSchemaLocation,
prompt: `The database schema for this workspace is located in \`db/schema.sql\`.`,
assert: async (rig, result) => {
@@ -155,6 +169,8 @@ describe('save_memory', () => {
const rememberingCodingStyle =
"Agent remembers user's coding style preference";
evalTest('ALWAYS_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: rememberingCodingStyle,
prompt: `I prefer to use tabs instead of spaces for indentation.`,
@@ -175,6 +191,8 @@ describe('save_memory', () => {
const ignoringBuildArtifactLocation =
'Agent ignores workspace build artifact location';
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: ignoringBuildArtifactLocation,
prompt: `In this workspace, build artifacts are stored in the \`dist/artifacts\` directory.`,
assert: async (rig, result) => {
@@ -193,6 +211,8 @@ describe('save_memory', () => {
const ignoringMainEntryPoint = "Agent ignores workspace's main entry point";
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: ignoringMainEntryPoint,
prompt: `The main entry point for this workspace is \`src/index.js\`.`,
assert: async (rig, result) => {
@@ -211,6 +231,8 @@ describe('save_memory', () => {
const rememberingBirthday = "Agent remembers user's birthday";
evalTest('ALWAYS_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: rememberingBirthday,
prompt: `My birthday is on June 15th.`,
@@ -231,6 +253,8 @@ describe('save_memory', () => {
const proactiveMemoryFromLongSession =
'Agent saves preference from earlier in conversation history';
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: proactiveMemoryFromLongSession,
params: {
settings: {
@@ -309,6 +333,8 @@ describe('save_memory', () => {
const memoryManagerRoutingPreferences =
'Agent routes global and project preferences to memory';
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: memoryManagerRoutingPreferences,
params: {
settings: {
+6
View File
@@ -21,6 +21,8 @@ describe('Shell Efficiency', () => {
};
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should use --silent/--quiet flags when installing packages',
prompt: 'Install the "lodash" package using npm.',
assert: async (rig) => {
@@ -50,6 +52,8 @@ describe('Shell Efficiency', () => {
});
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should use --no-pager with git commands',
prompt: 'Show the git log.',
assert: async (rig) => {
@@ -73,6 +77,8 @@ describe('Shell Efficiency', () => {
});
evalTest('ALWAYS_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should NOT use efficiency flags when enableShellOutputEfficiency is disabled',
params: {
settings: {
+12
View File
@@ -45,6 +45,8 @@ describe('subagent eval test cases', () => {
* This tests the system prompt's subagent specific clauses.
*/
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should delegate to user provided agent with relevant expertise',
params: {
settings: {
@@ -69,6 +71,8 @@ describe('subagent eval test cases', () => {
* subagents are available. This helps catch orchestration overuse.
*/
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should avoid delegating trivial direct edit work',
params: {
settings: {
@@ -113,6 +117,8 @@ describe('subagent eval test cases', () => {
* This is meant to codify the "overusing Generalist" failure mode.
*/
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should prefer relevant specialist over generalist',
params: {
settings: {
@@ -149,6 +155,8 @@ describe('subagent eval test cases', () => {
* naturally spans docs and tests, so multiple specialists should be used.
*/
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should use multiple relevant specialists for multi-surface task',
params: {
settings: {
@@ -193,6 +201,8 @@ describe('subagent eval test cases', () => {
* from a large pool of available subagents (10 total).
*/
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should select the correct subagent from a pool of 10 different agents',
prompt: 'Please add a new SQL table migration for a user profile.',
files: {
@@ -243,6 +253,8 @@ describe('subagent eval test cases', () => {
* This test includes stress tests the subagent delegation with ~80 tools.
*/
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should select the correct subagent from a pool of 10 different agents with MCP tools present',
prompt: 'Please add a new SQL table migration for a user profile.',
setup: async (rig) => {
+12
View File
@@ -49,6 +49,8 @@ describe('evalTest reliability logic', () => {
// Execute the test function directly
await internalEvalTest({
suiteName: 'test',
suiteType: 'behavioral',
name: 'test-api-failure',
prompt: 'do something',
assert: async () => {},
@@ -83,6 +85,8 @@ describe('evalTest reliability logic', () => {
// Expect the test function to throw immediately
await expect(
internalEvalTest({
suiteName: 'test',
suiteType: 'behavioral',
name: 'test-logic-failure',
prompt: 'do something',
assert: async () => {
@@ -108,6 +112,8 @@ describe('evalTest reliability logic', () => {
.mockResolvedValueOnce('Success');
await internalEvalTest({
suiteName: 'test',
suiteType: 'behavioral',
name: 'test-recovery',
prompt: 'do something',
assert: async () => {},
@@ -135,6 +141,8 @@ describe('evalTest reliability logic', () => {
);
await internalEvalTest({
suiteName: 'test',
suiteType: 'behavioral',
name: 'test-api-503',
prompt: 'do something',
assert: async () => {},
@@ -162,6 +170,8 @@ describe('evalTest reliability logic', () => {
try {
await expect(
internalEvalTest({
suiteName: 'test',
suiteType: 'behavioral',
name: 'test-absolute-path',
prompt: 'do something',
files: {
@@ -190,6 +200,8 @@ describe('evalTest reliability logic', () => {
try {
await expect(
internalEvalTest({
suiteName: 'test',
suiteType: 'behavioral',
name: 'test-traversal',
prompt: 'do something',
files: {
+94 -54
View File
@@ -16,10 +16,19 @@ import {
Storage,
getProjectHash,
SESSION_FILE_PREFIX,
PREVIEW_GEMINI_FLASH_MODEL,
getErrorMessage,
} from '@google/gemini-cli-core';
export * from '@google/gemini-cli-test-utils';
/**
* The default model used for all evaluations.
* Can be overridden by setting the GEMINI_MODEL environment variable.
*/
export const EVAL_MODEL =
process.env['GEMINI_MODEL'] || PREVIEW_GEMINI_FLASH_MODEL;
// Indicates the consistency expectation for this test.
// - ALWAYS_PASSES - Means that the test is expected to pass 100% of the time.
// These tests are typically trivial and test basic functionality with unambiguous
@@ -39,19 +48,49 @@ export * from '@google/gemini-cli-test-utils';
export type EvalPolicy = 'ALWAYS_PASSES' | 'USUALLY_PASSES';
export function evalTest(policy: EvalPolicy, evalCase: EvalCase) {
runEval(
policy,
evalCase.name,
() => internalEvalTest(evalCase),
evalCase.timeout,
);
runEval(policy, evalCase, () => internalEvalTest(evalCase));
}
export async function internalEvalTest(evalCase: EvalCase) {
export async function withEvalRetries(
name: string,
attemptFn: (attempt: number) => Promise<void>,
) {
const maxRetries = 3;
let attempt = 0;
while (attempt <= maxRetries) {
try {
await attemptFn(attempt);
return; // Success! Exit the retry loop.
} catch (error: unknown) {
const errorMessage = getErrorMessage(error);
const errorCode = getApiErrorCode(errorMessage);
if (errorCode) {
const status = attempt < maxRetries ? 'RETRY' : 'SKIP';
logReliabilityEvent(name, attempt, status, errorCode, errorMessage);
if (attempt < maxRetries) {
attempt++;
console.warn(
`[Eval] Attempt ${attempt} failed with ${errorCode} Error. Retrying...`,
);
continue; // Retry
}
console.warn(
`[Eval] '${name}' failed after ${maxRetries} retries due to persistent API errors. Skipping failure to avoid blocking PR.`,
);
return; // Gracefully exit without failing the test
}
throw error; // Real failure
}
}
}
export async function internalEvalTest(evalCase: EvalCase) {
await withEvalRetries(evalCase.name, async () => {
const rig = new TestRig();
const { logDir, sanitizedName } = await prepareLogDir(evalCase.name);
const activityLogFile = path.join(logDir, `${sanitizedName}.jsonl`);
@@ -59,14 +98,21 @@ export async function internalEvalTest(evalCase: EvalCase) {
let isSuccess = false;
try {
rig.setup(evalCase.name, evalCase.params);
const setupOptions = {
...evalCase.params,
settings: {
model: { name: EVAL_MODEL },
...evalCase.params?.settings,
},
};
rig.setup(evalCase.name, setupOptions);
if (evalCase.setup) {
await evalCase.setup(rig);
}
if (evalCase.files) {
await setupTestFiles(rig, evalCase.files);
await prepareWorkspace(rig.testDir!, rig.homeDir!, evalCase.files);
}
symlinkNodeModules(rig.testDir || '');
@@ -139,37 +185,6 @@ export async function internalEvalTest(evalCase: EvalCase) {
await evalCase.assert(rig, result);
isSuccess = true;
return; // Success! Exit the retry loop.
} catch (error: unknown) {
const errorMessage =
error instanceof Error ? error.message : String(error);
const errorCode = getApiErrorCode(errorMessage);
if (errorCode) {
const status = attempt < maxRetries ? 'RETRY' : 'SKIP';
logReliabilityEvent(
evalCase.name,
attempt,
status,
errorCode,
errorMessage,
);
if (attempt < maxRetries) {
attempt++;
console.warn(
`[Eval] Attempt ${attempt} failed with ${errorCode} Error. Retrying...`,
);
continue; // Retry
}
console.warn(
`[Eval] '${evalCase.name}' failed after ${maxRetries} retries due to persistent API errors. Skipping failure to avoid blocking PR.`,
);
return; // Gracefully exit without failing the test
}
throw error; // Real failure
} finally {
if (isSuccess) {
await fs.promises.unlink(activityLogFile).catch((err) => {
@@ -188,7 +203,7 @@ export async function internalEvalTest(evalCase: EvalCase) {
);
await rig.cleanup();
}
}
});
}
function getApiErrorCode(message: string): '500' | '503' | undefined {
@@ -226,7 +241,7 @@ function logReliabilityEvent(
const reliabilityLog = {
timestamp: new Date().toISOString(),
testName,
model: process.env.GEMINI_MODEL || 'unknown',
model: process.env['GEMINI_MODEL'] || 'unknown',
attempt,
status,
errorCode,
@@ -252,9 +267,13 @@ function logReliabilityEvent(
* intentionally uses synchronous filesystem and child_process operations
* for simplicity and to ensure sequential environment preparation.
*/
async function setupTestFiles(rig: TestRig, files: Record<string, string>) {
export async function prepareWorkspace(
testDir: string,
homeDir: string,
files: Record<string, string>,
) {
const acknowledgedAgents: Record<string, Record<string, string>> = {};
const projectRoot = fs.realpathSync(rig.testDir!);
const projectRoot = fs.realpathSync(testDir);
for (const [filePath, content] of Object.entries(files)) {
if (filePath.includes('..') || path.isAbsolute(filePath)) {
@@ -290,7 +309,7 @@ async function setupTestFiles(rig: TestRig, files: Record<string, string>) {
if (Object.keys(acknowledgedAgents).length > 0) {
const ackPath = path.join(
rig.homeDir!,
homeDir,
'.gemini',
'acknowledgments',
'agents.json',
@@ -299,7 +318,7 @@ async function setupTestFiles(rig: TestRig, files: Record<string, string>) {
fs.writeFileSync(ackPath, JSON.stringify(acknowledgedAgents, null, 2));
}
const execOptions = { cwd: rig.testDir!, stdio: 'inherit' as const };
const execOptions = { cwd: testDir, stdio: 'ignore' as const };
execSync('git init --initial-branch=main', execOptions);
execSync('git config user.email "test@example.com"', execOptions);
execSync('git config user.name "Test User"', execOptions);
@@ -320,14 +339,30 @@ async function setupTestFiles(rig: TestRig, files: Record<string, string>) {
*/
export function runEval(
policy: EvalPolicy,
name: string,
evalCase: BaseEvalCase,
fn: () => Promise<void>,
timeout?: number,
timeoutOverride?: number,
) {
if (policy === 'USUALLY_PASSES' && !process.env['RUN_EVALS']) {
it.skip(name, fn);
const { name, timeout, suiteName, suiteType } = evalCase;
const targetSuiteType = process.env['EVAL_SUITE_TYPE'];
const targetSuiteName = process.env['EVAL_SUITE_NAME'];
const meta = { suiteType, suiteName };
const skipBySuiteType =
targetSuiteType && suiteType && suiteType !== targetSuiteType;
const skipBySuiteName =
targetSuiteName && suiteName && suiteName !== targetSuiteName;
const options = { timeout: timeoutOverride ?? timeout, meta };
if (
(policy === 'USUALLY_PASSES' && !process.env['RUN_EVALS']) ||
skipBySuiteType ||
skipBySuiteName
) {
it.skip(name, options, fn);
} else {
it(name, fn, timeout);
it(name, options, fn);
}
}
@@ -366,15 +401,20 @@ interface ForbiddenToolSettings {
};
}
export interface EvalCase {
export interface BaseEvalCase {
suiteName: string;
suiteType: 'behavioral' | 'component-level' | 'hero-scenario';
name: string;
timeout?: number;
files?: Record<string, string>;
}
export interface EvalCase extends BaseEvalCase {
params?: {
settings?: ForbiddenToolSettings & Record<string, unknown>;
[key: string]: unknown;
};
prompt: string;
timeout?: number;
files?: Record<string, string>;
setup?: (rig: TestRig) => Promise<void> | void;
/** Conversation history to pre-load via --resume. Each entry is a message object with type, content, etc. */
messages?: Record<string, unknown>[];
+4
View File
@@ -31,6 +31,8 @@ describe('Tool Output Masking Behavioral Evals', () => {
* It should recognize the <tool_output_masked> tag and use a tool to read the file.
*/
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should attempt to read the redirected full output file when information is masked',
params: {
security: {
@@ -167,6 +169,8 @@ Output too large. Full output available at: ${outputFilePath}
* Scenario: Information is in the preview.
*/
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should NOT read the full output file when the information is already in the preview',
params: {
security: {
+4
View File
@@ -25,6 +25,8 @@ const FILES = {
describe('tracker_mode', () => {
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should manage tasks in the tracker when explicitly requested during a bug fix',
params: {
settings: { experimental: { taskTracker: true } },
@@ -78,6 +80,8 @@ describe('tracker_mode', () => {
});
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should implicitly create tasks when asked to build a feature plan',
params: {
settings: { experimental: { taskTracker: true } },
+2
View File
@@ -9,6 +9,8 @@ import { evalTest } from './test-helper.js';
describe('validation_fidelity', () => {
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should perform exhaustive validation autonomously when guided by system instructions',
files: {
'src/types.ts': `
@@ -9,6 +9,8 @@ import { evalTest } from './test-helper.js';
describe('validation_fidelity_pre_existing_errors', () => {
evalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'should handle pre-existing project errors gracefully during validation',
files: {
'src/math.ts': `
+4 -1
View File
@@ -24,7 +24,10 @@ export default defineConfig({
environment: 'node',
globals: true,
alias: {
react: path.resolve(__dirname, '../node_modules/react'),
'@google/gemini-cli-core': path.resolve(
__dirname,
'../packages/core/index.ts',
),
},
setupFiles: [path.resolve(__dirname, '../packages/cli/test-setup.ts')],
server: {