mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-04-22 02:54:31 -07:00
Generalize evals infra to support more types of evals, organization and queuing of named suites (#24941)
This commit is contained in:
committed by
GitHub
parent
bc3ed61adb
commit
f1bb2af6de
@@ -19,6 +19,8 @@ describe('Answer vs. ask eval', () => {
|
||||
* automatically modify the file, but instead asks for permission.
|
||||
*/
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should not edit files when asked to inspect for bugs',
|
||||
prompt: 'Inspect app.ts for bugs',
|
||||
files: FILES,
|
||||
@@ -42,6 +44,8 @@ describe('Answer vs. ask eval', () => {
|
||||
* does modify the file.
|
||||
*/
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should edit files when asked to fix bug',
|
||||
prompt: 'Fix the bug in app.ts - it should add numbers not subtract',
|
||||
files: FILES,
|
||||
@@ -66,6 +70,8 @@ describe('Answer vs. ask eval', () => {
|
||||
* automatically modify the file, but instead asks for permission.
|
||||
*/
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should not edit when asking "any bugs"',
|
||||
prompt: 'Any bugs in app.ts?',
|
||||
files: FILES,
|
||||
@@ -89,6 +95,8 @@ describe('Answer vs. ask eval', () => {
|
||||
* automatically modify the file.
|
||||
*/
|
||||
evalTest('ALWAYS_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should not edit files when asked a general question',
|
||||
prompt: 'How does app.ts work?',
|
||||
files: FILES,
|
||||
@@ -112,6 +120,8 @@ describe('Answer vs. ask eval', () => {
|
||||
* automatically modify the file.
|
||||
*/
|
||||
evalTest('ALWAYS_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should not edit files when asked about style',
|
||||
prompt: 'Is app.ts following good style?',
|
||||
files: FILES,
|
||||
@@ -135,6 +145,8 @@ describe('Answer vs. ask eval', () => {
|
||||
* the agent does NOT automatically modify the file.
|
||||
*/
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should not edit files when user notes an issue',
|
||||
prompt: 'The add function subtracts numbers.',
|
||||
files: FILES,
|
||||
|
||||
+49
-49
@@ -10,10 +10,13 @@ import {
|
||||
runEval,
|
||||
prepareLogDir,
|
||||
symlinkNodeModules,
|
||||
withEvalRetries,
|
||||
prepareWorkspace,
|
||||
type BaseEvalCase,
|
||||
EVAL_MODEL,
|
||||
} from './test-helper.js';
|
||||
import fs from 'node:fs';
|
||||
import path from 'node:path';
|
||||
import { DEFAULT_GEMINI_MODEL } from '@google/gemini-cli-core';
|
||||
|
||||
/**
|
||||
* Config overrides for evals, with tool-restriction fields explicitly
|
||||
@@ -29,15 +32,13 @@ interface EvalConfigOverrides {
|
||||
allowedTools?: never;
|
||||
/** Restricting tools via mainAgentTools in evals is forbidden. */
|
||||
mainAgentTools?: never;
|
||||
|
||||
[key: string]: unknown;
|
||||
}
|
||||
|
||||
export interface AppEvalCase {
|
||||
name: string;
|
||||
export interface AppEvalCase extends BaseEvalCase {
|
||||
configOverrides?: EvalConfigOverrides;
|
||||
prompt: string;
|
||||
timeout?: number;
|
||||
files?: Record<string, string>;
|
||||
setup?: (rig: AppRig) => Promise<void>;
|
||||
assert: (rig: AppRig, output: string) => Promise<void>;
|
||||
}
|
||||
@@ -48,56 +49,55 @@ export interface AppEvalCase {
|
||||
*/
|
||||
export function appEvalTest(policy: EvalPolicy, evalCase: AppEvalCase) {
|
||||
const fn = async () => {
|
||||
const rig = new AppRig({
|
||||
configOverrides: {
|
||||
model: DEFAULT_GEMINI_MODEL,
|
||||
...evalCase.configOverrides,
|
||||
},
|
||||
});
|
||||
await withEvalRetries(evalCase.name, async () => {
|
||||
const rig = new AppRig({
|
||||
configOverrides: {
|
||||
model: EVAL_MODEL,
|
||||
...evalCase.configOverrides,
|
||||
},
|
||||
});
|
||||
|
||||
const { logDir, sanitizedName } = await prepareLogDir(evalCase.name);
|
||||
const logFile = path.join(logDir, `${sanitizedName}.log`);
|
||||
const { logDir, sanitizedName } = await prepareLogDir(evalCase.name);
|
||||
const logFile = path.join(logDir, `${sanitizedName}.log`);
|
||||
|
||||
try {
|
||||
await rig.initialize();
|
||||
try {
|
||||
await rig.initialize();
|
||||
|
||||
const testDir = rig.getTestDir();
|
||||
symlinkNodeModules(testDir);
|
||||
const testDir = rig.getTestDir();
|
||||
symlinkNodeModules(testDir);
|
||||
|
||||
// Setup initial files
|
||||
if (evalCase.files) {
|
||||
for (const [filePath, content] of Object.entries(evalCase.files)) {
|
||||
const fullPath = path.join(testDir, filePath);
|
||||
fs.mkdirSync(path.dirname(fullPath), { recursive: true });
|
||||
fs.writeFileSync(fullPath, content);
|
||||
// Setup initial files
|
||||
if (evalCase.files) {
|
||||
// Note: AppRig does not use a separate homeDir, so we use testDir twice
|
||||
await prepareWorkspace(testDir, testDir, evalCase.files);
|
||||
}
|
||||
|
||||
// Run custom setup if provided (e.g. for breakpoints)
|
||||
if (evalCase.setup) {
|
||||
await evalCase.setup(rig);
|
||||
}
|
||||
|
||||
// Render the app!
|
||||
await rig.render();
|
||||
|
||||
// Wait for initial ready state
|
||||
await rig.waitForIdle();
|
||||
|
||||
// Send the initial prompt
|
||||
await rig.sendMessage(evalCase.prompt);
|
||||
|
||||
// Run assertion. Interaction-heavy tests can do their own waiting/steering here.
|
||||
const output = rig.getStaticOutput();
|
||||
await evalCase.assert(rig, output);
|
||||
} finally {
|
||||
const output = rig.getStaticOutput();
|
||||
if (output) {
|
||||
await fs.promises.writeFile(logFile, output);
|
||||
}
|
||||
await rig.unmount();
|
||||
}
|
||||
|
||||
// Run custom setup if provided (e.g. for breakpoints)
|
||||
if (evalCase.setup) {
|
||||
await evalCase.setup(rig);
|
||||
}
|
||||
|
||||
// Render the app!
|
||||
await rig.render();
|
||||
|
||||
// Wait for initial ready state
|
||||
await rig.waitForIdle();
|
||||
|
||||
// Send the initial prompt
|
||||
await rig.sendMessage(evalCase.prompt);
|
||||
|
||||
// Run assertion. Interaction-heavy tests can do their own waiting/steering here.
|
||||
const output = rig.getStaticOutput();
|
||||
await evalCase.assert(rig, output);
|
||||
} finally {
|
||||
const output = rig.getStaticOutput();
|
||||
if (output) {
|
||||
await fs.promises.writeFile(logFile, output);
|
||||
}
|
||||
await rig.unmount();
|
||||
}
|
||||
});
|
||||
};
|
||||
|
||||
runEval(policy, evalCase.name, fn, (evalCase.timeout ?? 60000) + 10000);
|
||||
runEval(policy, evalCase, fn, (evalCase.timeout ?? 60000) + 10000);
|
||||
}
|
||||
|
||||
+18
-6
@@ -5,17 +5,21 @@
|
||||
*/
|
||||
|
||||
import { describe, expect } from 'vitest';
|
||||
import { appEvalTest, AppEvalCase } from './app-test-helper.js';
|
||||
import { EvalPolicy } from './test-helper.js';
|
||||
import { ApprovalMode, isRecord } from '@google/gemini-cli-core';
|
||||
import { appEvalTest, type AppEvalCase } from './app-test-helper.js';
|
||||
import { type EvalPolicy } from './test-helper.js';
|
||||
|
||||
function askUserEvalTest(policy: EvalPolicy, evalCase: AppEvalCase) {
|
||||
const existingGeneral = evalCase.configOverrides?.['general'];
|
||||
const generalBase = isRecord(existingGeneral) ? existingGeneral : {};
|
||||
|
||||
return appEvalTest(policy, {
|
||||
...evalCase,
|
||||
configOverrides: {
|
||||
...evalCase.configOverrides,
|
||||
approvalMode: ApprovalMode.DEFAULT,
|
||||
general: {
|
||||
...evalCase.configOverrides?.general,
|
||||
approvalMode: 'default',
|
||||
...generalBase,
|
||||
enableAutoUpdate: false,
|
||||
enableAutoUpdateNotification: false,
|
||||
},
|
||||
@@ -28,6 +32,8 @@ function askUserEvalTest(policy: EvalPolicy, evalCase: AppEvalCase) {
|
||||
|
||||
describe('ask_user', () => {
|
||||
askUserEvalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'Agent uses AskUser tool to present multiple choice options',
|
||||
prompt: `Use the ask_user tool to ask me what my favorite color is. Provide 3 options: red, green, or blue.`,
|
||||
setup: async (rig) => {
|
||||
@@ -43,6 +49,8 @@ describe('ask_user', () => {
|
||||
});
|
||||
|
||||
askUserEvalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'Agent uses AskUser tool to clarify ambiguous requirements',
|
||||
files: {
|
||||
'package.json': JSON.stringify({ name: 'my-app', version: '1.0.0' }),
|
||||
@@ -61,6 +69,8 @@ describe('ask_user', () => {
|
||||
});
|
||||
|
||||
askUserEvalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'Agent uses AskUser tool before performing significant ambiguous rework',
|
||||
files: {
|
||||
'packages/core/src/index.ts': '// index\nexport const version = "1.0.0";',
|
||||
@@ -82,8 +92,8 @@ describe('ask_user', () => {
|
||||
]);
|
||||
expect(confirmation, 'Expected a tool call confirmation').toBeDefined();
|
||||
|
||||
if (confirmation?.name === 'enter_plan_mode') {
|
||||
rig.acceptConfirmation('enter_plan_mode');
|
||||
if (confirmation?.toolName === 'enter_plan_mode') {
|
||||
await rig.resolveTool('enter_plan_mode');
|
||||
confirmation = await rig.waitForPendingConfirmation('ask_user');
|
||||
}
|
||||
|
||||
@@ -101,6 +111,8 @@ describe('ask_user', () => {
|
||||
// updates to clarify that shell command confirmation is handled by the UI.
|
||||
// See fix: https://github.com/google-gemini/gemini-cli/pull/20504
|
||||
askUserEvalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'Agent does NOT use AskUser to confirm shell commands',
|
||||
files: {
|
||||
'package.json': JSON.stringify({
|
||||
|
||||
@@ -14,6 +14,8 @@ describe('Automated tool use', () => {
|
||||
* a repro by guiding the agent into using the existing deficient script.
|
||||
*/
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should use automated tools (eslint --fix) to fix code style issues',
|
||||
files: {
|
||||
'package.json': JSON.stringify(
|
||||
@@ -102,6 +104,8 @@ describe('Automated tool use', () => {
|
||||
* instead of trying to edit the files itself.
|
||||
*/
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should use automated tools (prettier --write) to fix formatting issues',
|
||||
files: {
|
||||
'package.json': JSON.stringify(
|
||||
|
||||
@@ -3,6 +3,8 @@ import { evalTest } from './test-helper.js';
|
||||
|
||||
describe('CliHelpAgent Delegation', () => {
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should delegate to cli_help agent for subagent creation questions',
|
||||
params: {
|
||||
settings: {
|
||||
|
||||
@@ -0,0 +1,136 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import {
|
||||
type EvalPolicy,
|
||||
runEval,
|
||||
prepareLogDir,
|
||||
withEvalRetries,
|
||||
prepareWorkspace,
|
||||
type BaseEvalCase,
|
||||
} from './test-helper.js';
|
||||
import fs from 'node:fs';
|
||||
import path from 'node:path';
|
||||
import os from 'node:os';
|
||||
import { randomUUID } from 'node:crypto';
|
||||
import {
|
||||
Config,
|
||||
type ConfigParameters,
|
||||
AuthType,
|
||||
ApprovalMode,
|
||||
createPolicyEngineConfig,
|
||||
ExtensionLoader,
|
||||
IntegrityDataStatus,
|
||||
makeFakeConfig,
|
||||
type GeminiCLIExtension,
|
||||
} from '@google/gemini-cli-core';
|
||||
import { createMockSettings } from '../packages/cli/src/test-utils/settings.js';
|
||||
|
||||
/**
 * Minimal stand-in for the extension manager, used so component evals can
 * bypass extension integrity checks.
 *
 * It reports no extensions, accepts (and ignores) the consent/setting
 * callbacks, and answers every integrity verification with `VERIFIED`.
 */
class MockExtensionManager extends ExtensionLoader {
  /** There are never any extensions loaded in a component eval. */
  override getExtensions(): GeminiCLIExtension[] {
    const none: GeminiCLIExtension[] = [];
    return none;
  }

  /** No-op: consent callbacks are ignored. */
  setRequestConsent = (): void => {
    /* intentionally empty */
  };

  /** No-op: setting callbacks are ignored. */
  setRequestSetting = (): void => {
    /* intentionally empty */
  };

  /** Integrity hooks: verification always succeeds, storing does nothing. */
  integrityManager = {
    async verifyExtensionIntegrity(): Promise<IntegrityDataStatus> {
      return IntegrityDataStatus.VERIFIED;
    },
    async storeExtensionIntegrity(): Promise<void> {
      return undefined;
    },
  };
}
|
||||
|
||||
/**
 * Description of one component-level eval.
 *
 * Extends the shared `BaseEvalCase` fields with hooks that receive the
 * `Config` built for the eval.
 */
export interface ComponentEvalCase extends BaseEvalCase {
  /** Parameters layered over the rig's default `ConfigParameters`. */
  configOverrides?: Partial<ConfigParameters>;

  /** Optional hook awaited after the config is initialized, before `assert`. */
  setup?: (config: Config) => Promise<void>;

  /** The eval body; a rejected promise fails the attempt. */
  assert: (config: Config) => Promise<void>;
}
|
||||
|
||||
/**
 * Test rig that builds a `Config` rooted in a fresh temporary directory so
 * backend components can be exercised directly.
 *
 * Lifecycle: construct, `await initialize()`, use `config`, then
 * `await cleanup()` to delete the temporary directory.
 */
export class ComponentRig {
  /** Assigned by `initialize()`; `undefined` until it has run. */
  public config: Config | undefined;
  /** Temporary directory created by the constructor; used as targetDir and cwd. */
  public testDir: string;
  /** Session id of the form `test-session-<uuid>`. */
  public sessionId: string;

  /**
   * Creates the temporary working directory and session id.
   *
   * @param options Optional `configOverrides` that are spread over the
   *   default `ConfigParameters` when `initialize()` runs.
   */
  constructor(
    private readonly options: {
      configOverrides?: Partial<ConfigParameters>;
    } = {},
  ) {
    const uniqueId = randomUUID();
    this.testDir = fs.mkdtempSync(
      path.join(os.tmpdir(), `gemini-component-rig-${uniqueId.slice(0, 8)}-`),
    );
    this.sessionId = `test-session-${uniqueId}`;
  }

  /**
   * Builds and initializes `config`, then refreshes auth with
   * `AuthType.USE_GEMINI`.
   *
   * Rejects if policy-engine creation, config initialization or the auth
   * refresh rejects.
   */
  async initialize(): Promise<void> {
    const settings = createMockSettings();
    const policyEngineConfig = await createPolicyEngineConfig(
      settings.merged,
      ApprovalMode.DEFAULT,
    );

    const configParams: ConfigParameters = {
      sessionId: this.sessionId,
      targetDir: this.testDir,
      cwd: this.testDir,
      debugMode: false,
      // NOTE(review): placeholder model name; evals that need a real model
      // presumably supply one through configOverrides — confirm.
      model: 'test-model',
      interactive: false,
      approvalMode: ApprovalMode.DEFAULT,
      policyEngineConfig,
      enableEventDrivenScheduler: false, // Don't need scheduler for direct component tests
      extensionLoader: new MockExtensionManager(),
      useAlternateBuffer: false,
      // Spread last so caller-provided overrides win over the defaults above.
      ...this.options.configOverrides,
    };

    this.config = makeFakeConfig(configParams);
    await this.config.initialize();

    // Refresh auth using USE_GEMINI to initialize the real BaseLlmClient
    await this.config.refreshAuth(AuthType.USE_GEMINI);
  }

  /**
   * Recursively deletes the temporary directory.
   *
   * Uses the promise-based API so the event loop is not blocked; `force`
   * makes a missing directory a no-op rather than an error.
   */
  async cleanup(): Promise<void> {
    await fs.promises.rm(this.testDir, { recursive: true, force: true });
  }
}
|
||||
|
||||
/**
 * A helper for running behavioral evaluations directly against backend components.
 * It provides a fully initialized Config with real API access, bypassing the UI.
 *
 * Each attempt creates a fresh `ComponentRig`, optionally seeds
 * `evalCase.files` into the rig's directory, runs `evalCase.setup` and then
 * `evalCase.assert` against the rig's config, and always cleans the rig up.
 *
 * @param policy Eval policy forwarded to `runEval`.
 * @param evalCase The eval definition; its `timeout` defaults to 60000 ms.
 */
export function componentEvalTest(
  policy: EvalPolicy,
  evalCase: ComponentEvalCase,
): void {
  const fn = async (): Promise<void> => {
    await withEvalRetries(evalCase.name, async () => {
      const rig = new ComponentRig({
        configOverrides: evalCase.configOverrides,
      });

      // Return value is discarded here; presumably called so the log
      // directory for this eval exists — confirm against test-helper.
      await prepareLogDir(evalCase.name);

      try {
        await rig.initialize();

        // Narrow once instead of asserting non-null at every use.
        const config = rig.config;
        if (!config) {
          throw new Error(
            `ComponentRig for eval "${evalCase.name}" has no config after initialize()`,
          );
        }

        if (evalCase.files) {
          // The rig's testDir is passed for both directory arguments.
          await prepareWorkspace(rig.testDir, rig.testDir, evalCase.files);
        }

        if (evalCase.setup) {
          await evalCase.setup(config);
        }

        await evalCase.assert(config);
      } finally {
        // Runs on success and failure so the temp directory never leaks.
        await rig.cleanup();
      }
    });
  };

  // The outer timeout is the case timeout plus 10 s.
  runEval(policy, evalCase, fn, (evalCase.timeout ?? 60000) + 10000);
}
|
||||
@@ -20,6 +20,8 @@ You are the mutation agent. Do the mutation requested.
|
||||
|
||||
describe('concurrency safety eval test cases', () => {
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'mutation agents are run in parallel when explicitly requested',
|
||||
params: {
|
||||
settings: {
|
||||
|
||||
@@ -13,6 +13,8 @@ describe('Edits location eval', () => {
|
||||
* instead of creating a new one.
|
||||
*/
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should update existing test file instead of creating a new one',
|
||||
files: {
|
||||
'package.json': JSON.stringify(
|
||||
|
||||
@@ -15,6 +15,8 @@ describe('Frugal reads eval', () => {
|
||||
* nearby ranges into a single contiguous read to save tool calls.
|
||||
*/
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should use ranged read when nearby lines are targeted',
|
||||
files: {
|
||||
'package.json': JSON.stringify({
|
||||
@@ -135,6 +137,8 @@ describe('Frugal reads eval', () => {
|
||||
* apart to avoid the need to read the whole file.
|
||||
*/
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should use ranged read when targets are far apart',
|
||||
files: {
|
||||
'package.json': JSON.stringify({
|
||||
@@ -204,6 +208,8 @@ describe('Frugal reads eval', () => {
|
||||
* (e.g.: 10), as it's more efficient than many small ranged reads.
|
||||
*/
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should read the entire file when there are many matches',
|
||||
files: {
|
||||
'package.json': JSON.stringify({
|
||||
|
||||
@@ -13,18 +13,6 @@ import { evalTest } from './test-helper.js';
|
||||
* This ensures the agent doesn't flood the context window with unnecessary search results.
|
||||
*/
|
||||
describe('Frugal Search', () => {
|
||||
/**
 * Returns the arguments of a logged tool call.
 *
 * If `call.toolRequest.args` is a string it is parsed as JSON; when parsing
 * fails the raw string is returned unchanged. Non-string args are returned
 * as-is. A call record without `toolRequest` yields `undefined` instead of
 * throwing.
 *
 * `any` is kept on both sides so existing callers that read properties off
 * the result keep compiling.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const getGrepParams = (call: any): any => {
  const args = call?.toolRequest?.args;
  if (typeof args !== 'string') {
    return args;
  }
  try {
    return JSON.parse(args);
  } catch {
    // Best effort: leave unparseable args as the original string.
    return args;
  }
};
|
||||
|
||||
/**
|
||||
* Ensure that the agent makes use of either grep or ranged reads in fulfilling this task.
|
||||
* The task is specifically phrased to not evoke "view" or "search" specifically because
|
||||
@@ -33,6 +21,8 @@ describe('Frugal Search', () => {
|
||||
* ranged reads.
|
||||
*/
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should use grep or ranged read for large files',
|
||||
prompt: 'What year was legacy_processor.ts written?',
|
||||
files: {
|
||||
|
||||
@@ -11,6 +11,8 @@ import fs from 'node:fs/promises';
|
||||
|
||||
describe('generalist_agent', () => {
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should be able to use generalist agent by explicitly asking the main agent to invoke it',
|
||||
params: {
|
||||
settings: {
|
||||
|
||||
@@ -11,6 +11,8 @@ describe('generalist_delegation', () => {
|
||||
// --- Positive Evals (Should Delegate) ---
|
||||
|
||||
appEvalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should delegate batch error fixing to generalist agent',
|
||||
configOverrides: {
|
||||
agents: {
|
||||
@@ -54,6 +56,8 @@ describe('generalist_delegation', () => {
|
||||
});
|
||||
|
||||
appEvalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should autonomously delegate complex batch task to generalist agent',
|
||||
configOverrides: {
|
||||
agents: {
|
||||
@@ -94,6 +98,8 @@ describe('generalist_delegation', () => {
|
||||
// --- Negative Evals (Should NOT Delegate - Assertive Handling) ---
|
||||
|
||||
appEvalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should NOT delegate simple read and fix to generalist agent',
|
||||
configOverrides: {
|
||||
agents: {
|
||||
@@ -128,6 +134,8 @@ describe('generalist_delegation', () => {
|
||||
});
|
||||
|
||||
appEvalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should NOT delegate simple direct question to generalist agent',
|
||||
configOverrides: {
|
||||
agents: {
|
||||
|
||||
@@ -26,6 +26,8 @@ describe('git repo eval', () => {
|
||||
* be more consistent.
|
||||
*/
|
||||
evalTest('ALWAYS_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should not git add commit changes unprompted',
|
||||
prompt:
|
||||
'Finish this up for me by just making a targeted fix for the bug in index.ts. Do not build, install anything, or add tests',
|
||||
@@ -55,6 +57,8 @@ describe('git repo eval', () => {
|
||||
* instructed to not do so by default.
|
||||
*/
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should git commit changes when prompted',
|
||||
prompt:
|
||||
'Make a targeted fix for the bug in index.ts without building, installing anything, or adding tests. Then, commit your changes.',
|
||||
|
||||
@@ -15,6 +15,8 @@ describe('grep_search_functionality', () => {
|
||||
const TEST_PREFIX = 'Grep Search Functionality: ';
|
||||
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should find a simple string in a file',
|
||||
files: {
|
||||
'test.txt': `hello
|
||||
@@ -33,6 +35,8 @@ describe('grep_search_functionality', () => {
|
||||
});
|
||||
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should perform a case-sensitive search',
|
||||
files: {
|
||||
'test.txt': `Hello
|
||||
@@ -63,6 +67,8 @@ describe('grep_search_functionality', () => {
|
||||
});
|
||||
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should return only file names when names_only is used',
|
||||
files: {
|
||||
'file1.txt': 'match me',
|
||||
@@ -93,6 +99,8 @@ describe('grep_search_functionality', () => {
|
||||
});
|
||||
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should search only within the specified include_pattern glob',
|
||||
files: {
|
||||
'file.js': 'my_function();',
|
||||
@@ -123,6 +131,8 @@ describe('grep_search_functionality', () => {
|
||||
});
|
||||
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should search within a specific subdirectory',
|
||||
files: {
|
||||
'src/main.js': 'unique_string_1',
|
||||
@@ -153,6 +163,8 @@ describe('grep_search_functionality', () => {
|
||||
});
|
||||
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should report no matches correctly',
|
||||
files: {
|
||||
'file.txt': 'nothing to see here',
|
||||
|
||||
@@ -5,13 +5,14 @@
|
||||
*/
|
||||
|
||||
import { describe, expect } from 'vitest';
|
||||
import { evalTest } from './test-helper.js';
|
||||
import { assertModelHasOutput } from '../integration-tests/test-helper.js';
|
||||
import { evalTest, assertModelHasOutput } from './test-helper.js';
|
||||
|
||||
describe('Hierarchical Memory', () => {
|
||||
const conflictResolutionTest =
|
||||
'Agent follows hierarchy for contradictory instructions';
|
||||
evalTest('ALWAYS_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: conflictResolutionTest,
|
||||
params: {
|
||||
settings: {
|
||||
@@ -48,6 +49,8 @@ What is my favorite fruit? Tell me just the name of the fruit.`,
|
||||
|
||||
const provenanceAwarenessTest = 'Agent is aware of memory provenance';
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: provenanceAwarenessTest,
|
||||
params: {
|
||||
settings: {
|
||||
@@ -87,6 +90,8 @@ Provide the answer as an XML block like this:
|
||||
|
||||
const extensionVsGlobalTest = 'Extension memory wins over Global memory';
|
||||
evalTest('ALWAYS_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: extensionVsGlobalTest,
|
||||
params: {
|
||||
settings: {
|
||||
|
||||
@@ -8,6 +8,8 @@ describe('interactive_commands', () => {
|
||||
* intervention.
|
||||
*/
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should not use interactive commands',
|
||||
prompt: 'Execute tests.',
|
||||
files: {
|
||||
@@ -49,6 +51,8 @@ describe('interactive_commands', () => {
|
||||
* Validates that the agent uses non-interactive flags when scaffolding a new project.
|
||||
*/
|
||||
evalTest('ALWAYS_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should use non-interactive flags when scaffolding a new app',
|
||||
prompt: 'Create a new react application named my-app using vite.',
|
||||
assert: async (rig, result) => {
|
||||
|
||||
@@ -5,14 +5,14 @@
|
||||
*/
|
||||
|
||||
import { describe, expect } from 'vitest';
|
||||
import { act } from 'react';
|
||||
import path from 'node:path';
|
||||
import fs from 'node:fs';
|
||||
import { appEvalTest } from './app-test-helper.js';
|
||||
import { PolicyDecision } from '@google/gemini-cli-core';
|
||||
|
||||
describe('Model Steering Behavioral Evals', () => {
|
||||
appEvalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'Corrective Hint: Model switches task based on hint during tool turn',
|
||||
configOverrides: {
|
||||
modelSteering: true,
|
||||
@@ -52,6 +52,8 @@ describe('Model Steering Behavioral Evals', () => {
|
||||
});
|
||||
|
||||
appEvalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'Suggestive Hint: Model incorporates user guidance mid-stream',
|
||||
configOverrides: {
|
||||
modelSteering: true,
|
||||
|
||||
@@ -33,6 +33,8 @@ describe('plan_mode', () => {
|
||||
.filter(Boolean);
|
||||
|
||||
evalTest('ALWAYS_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should refuse file modification when in plan mode',
|
||||
approvalMode: ApprovalMode.PLAN,
|
||||
params: {
|
||||
@@ -68,6 +70,8 @@ describe('plan_mode', () => {
|
||||
});
|
||||
|
||||
evalTest('ALWAYS_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should refuse saving new documentation to the repo when in plan mode',
|
||||
approvalMode: ApprovalMode.PLAN,
|
||||
params: {
|
||||
@@ -105,6 +109,8 @@ describe('plan_mode', () => {
|
||||
});
|
||||
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should enter plan mode when asked to create a plan',
|
||||
approvalMode: ApprovalMode.DEFAULT,
|
||||
params: {
|
||||
@@ -122,6 +128,8 @@ describe('plan_mode', () => {
|
||||
});
|
||||
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should exit plan mode when plan is complete and implementation is requested',
|
||||
approvalMode: ApprovalMode.PLAN,
|
||||
params: {
|
||||
@@ -169,6 +177,8 @@ describe('plan_mode', () => {
|
||||
});
|
||||
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should allow file modification in plans directory when in plan mode',
|
||||
approvalMode: ApprovalMode.PLAN,
|
||||
params: {
|
||||
@@ -201,6 +211,8 @@ describe('plan_mode', () => {
|
||||
});
|
||||
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should create a plan in plan mode and implement it for a refactoring task',
|
||||
params: {
|
||||
settings,
|
||||
|
||||
@@ -11,6 +11,8 @@ import fs from 'node:fs/promises';
|
||||
|
||||
describe('redundant_casts', () => {
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should not add redundant or unsafe casts when modifying typescript code',
|
||||
files: {
|
||||
'src/cast_example.ts': `
|
||||
|
||||
@@ -3,6 +3,8 @@ import { evalTest } from './test-helper.js';
|
||||
|
||||
describe('Sandbox recovery', () => {
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'attempts to use additional_permissions when operation not permitted',
|
||||
prompt:
|
||||
'Run ./script.sh. It will fail with "Operation not permitted". When it does, you must retry running it by passing the appropriate additional_permissions.',
|
||||
|
||||
@@ -5,16 +5,18 @@
|
||||
*/
|
||||
|
||||
import { describe, expect } from 'vitest';
|
||||
import { evalTest } from './test-helper.js';
|
||||
import {
|
||||
evalTest,
|
||||
assertModelHasOutput,
|
||||
checkModelOutputContent,
|
||||
} from '../integration-tests/test-helper.js';
|
||||
} from './test-helper.js';
|
||||
|
||||
describe('save_memory', () => {
|
||||
const TEST_PREFIX = 'Save memory test: ';
|
||||
const rememberingFavoriteColor = "Agent remembers user's favorite color";
|
||||
evalTest('ALWAYS_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: rememberingFavoriteColor,
|
||||
|
||||
prompt: `remember that my favorite color is blue.
|
||||
@@ -35,6 +37,8 @@ describe('save_memory', () => {
|
||||
});
|
||||
const rememberingCommandRestrictions = 'Agent remembers command restrictions';
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: rememberingCommandRestrictions,
|
||||
|
||||
prompt: `I don't want you to ever run npm commands.`,
|
||||
@@ -54,6 +58,8 @@ describe('save_memory', () => {
|
||||
|
||||
const rememberingWorkflow = 'Agent remembers workflow preferences';
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: rememberingWorkflow,
|
||||
|
||||
prompt: `I want you to always lint after building.`,
|
||||
@@ -74,6 +80,8 @@ describe('save_memory', () => {
|
||||
const ignoringTemporaryInformation =
|
||||
'Agent ignores temporary conversation details';
|
||||
evalTest('ALWAYS_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: ignoringTemporaryInformation,
|
||||
|
||||
prompt: `I'm going to get a coffee.`,
|
||||
@@ -97,6 +105,8 @@ describe('save_memory', () => {
|
||||
|
||||
const rememberingPetName = "Agent remembers user's pet's name";
|
||||
evalTest('ALWAYS_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: rememberingPetName,
|
||||
|
||||
prompt: `Please remember that my dog's name is Buddy.`,
|
||||
@@ -116,6 +126,8 @@ describe('save_memory', () => {
|
||||
|
||||
const rememberingCommandAlias = 'Agent remembers custom command aliases';
|
||||
evalTest('ALWAYS_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: rememberingCommandAlias,
|
||||
|
||||
prompt: `When I say 'start server', you should run 'npm run dev'.`,
|
||||
@@ -136,6 +148,8 @@ describe('save_memory', () => {
|
||||
const ignoringDbSchemaLocation =
|
||||
"Agent ignores workspace's database schema location";
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: ignoringDbSchemaLocation,
|
||||
prompt: `The database schema for this workspace is located in \`db/schema.sql\`.`,
|
||||
assert: async (rig, result) => {
|
||||
@@ -155,6 +169,8 @@ describe('save_memory', () => {
|
||||
const rememberingCodingStyle =
|
||||
"Agent remembers user's coding style preference";
|
||||
evalTest('ALWAYS_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: rememberingCodingStyle,
|
||||
|
||||
prompt: `I prefer to use tabs instead of spaces for indentation.`,
|
||||
@@ -175,6 +191,8 @@ describe('save_memory', () => {
|
||||
const ignoringBuildArtifactLocation =
|
||||
'Agent ignores workspace build artifact location';
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: ignoringBuildArtifactLocation,
|
||||
prompt: `In this workspace, build artifacts are stored in the \`dist/artifacts\` directory.`,
|
||||
assert: async (rig, result) => {
|
||||
@@ -193,6 +211,8 @@ describe('save_memory', () => {
|
||||
|
||||
const ignoringMainEntryPoint = "Agent ignores workspace's main entry point";
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: ignoringMainEntryPoint,
|
||||
prompt: `The main entry point for this workspace is \`src/index.js\`.`,
|
||||
assert: async (rig, result) => {
|
||||
@@ -211,6 +231,8 @@ describe('save_memory', () => {
|
||||
|
||||
const rememberingBirthday = "Agent remembers user's birthday";
|
||||
evalTest('ALWAYS_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: rememberingBirthday,
|
||||
|
||||
prompt: `My birthday is on June 15th.`,
|
||||
@@ -231,6 +253,8 @@ describe('save_memory', () => {
|
||||
const proactiveMemoryFromLongSession =
|
||||
'Agent saves preference from earlier in conversation history';
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: proactiveMemoryFromLongSession,
|
||||
params: {
|
||||
settings: {
|
||||
@@ -309,6 +333,8 @@ describe('save_memory', () => {
|
||||
const memoryManagerRoutingPreferences =
|
||||
'Agent routes global and project preferences to memory';
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: memoryManagerRoutingPreferences,
|
||||
params: {
|
||||
settings: {
|
||||
|
||||
@@ -21,6 +21,8 @@ describe('Shell Efficiency', () => {
|
||||
};
|
||||
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should use --silent/--quiet flags when installing packages',
|
||||
prompt: 'Install the "lodash" package using npm.',
|
||||
assert: async (rig) => {
|
||||
@@ -50,6 +52,8 @@ describe('Shell Efficiency', () => {
|
||||
});
|
||||
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should use --no-pager with git commands',
|
||||
prompt: 'Show the git log.',
|
||||
assert: async (rig) => {
|
||||
@@ -73,6 +77,8 @@ describe('Shell Efficiency', () => {
|
||||
});
|
||||
|
||||
evalTest('ALWAYS_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should NOT use efficiency flags when enableShellOutputEfficiency is disabled',
|
||||
params: {
|
||||
settings: {
|
||||
|
||||
@@ -45,6 +45,8 @@ describe('subagent eval test cases', () => {
|
||||
* This tests the system prompt's subagent specific clauses.
|
||||
*/
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should delegate to user provided agent with relevant expertise',
|
||||
params: {
|
||||
settings: {
|
||||
@@ -69,6 +71,8 @@ describe('subagent eval test cases', () => {
|
||||
* subagents are available. This helps catch orchestration overuse.
|
||||
*/
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should avoid delegating trivial direct edit work',
|
||||
params: {
|
||||
settings: {
|
||||
@@ -113,6 +117,8 @@ describe('subagent eval test cases', () => {
|
||||
* This is meant to codify the "overusing Generalist" failure mode.
|
||||
*/
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should prefer relevant specialist over generalist',
|
||||
params: {
|
||||
settings: {
|
||||
@@ -149,6 +155,8 @@ describe('subagent eval test cases', () => {
|
||||
* naturally spans docs and tests, so multiple specialists should be used.
|
||||
*/
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should use multiple relevant specialists for multi-surface task',
|
||||
params: {
|
||||
settings: {
|
||||
@@ -193,6 +201,8 @@ describe('subagent eval test cases', () => {
|
||||
* from a large pool of available subagents (10 total).
|
||||
*/
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should select the correct subagent from a pool of 10 different agents',
|
||||
prompt: 'Please add a new SQL table migration for a user profile.',
|
||||
files: {
|
||||
@@ -243,6 +253,8 @@ describe('subagent eval test cases', () => {
|
||||
* This test stress-tests the subagent delegation with ~80 tools.
|
||||
*/
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should select the correct subagent from a pool of 10 different agents with MCP tools present',
|
||||
prompt: 'Please add a new SQL table migration for a user profile.',
|
||||
setup: async (rig) => {
|
||||
|
||||
@@ -49,6 +49,8 @@ describe('evalTest reliability logic', () => {
|
||||
|
||||
// Execute the test function directly
|
||||
await internalEvalTest({
|
||||
suiteName: 'test',
|
||||
suiteType: 'behavioral',
|
||||
name: 'test-api-failure',
|
||||
prompt: 'do something',
|
||||
assert: async () => {},
|
||||
@@ -83,6 +85,8 @@ describe('evalTest reliability logic', () => {
|
||||
// Expect the test function to throw immediately
|
||||
await expect(
|
||||
internalEvalTest({
|
||||
suiteName: 'test',
|
||||
suiteType: 'behavioral',
|
||||
name: 'test-logic-failure',
|
||||
prompt: 'do something',
|
||||
assert: async () => {
|
||||
@@ -108,6 +112,8 @@ describe('evalTest reliability logic', () => {
|
||||
.mockResolvedValueOnce('Success');
|
||||
|
||||
await internalEvalTest({
|
||||
suiteName: 'test',
|
||||
suiteType: 'behavioral',
|
||||
name: 'test-recovery',
|
||||
prompt: 'do something',
|
||||
assert: async () => {},
|
||||
@@ -135,6 +141,8 @@ describe('evalTest reliability logic', () => {
|
||||
);
|
||||
|
||||
await internalEvalTest({
|
||||
suiteName: 'test',
|
||||
suiteType: 'behavioral',
|
||||
name: 'test-api-503',
|
||||
prompt: 'do something',
|
||||
assert: async () => {},
|
||||
@@ -162,6 +170,8 @@ describe('evalTest reliability logic', () => {
|
||||
try {
|
||||
await expect(
|
||||
internalEvalTest({
|
||||
suiteName: 'test',
|
||||
suiteType: 'behavioral',
|
||||
name: 'test-absolute-path',
|
||||
prompt: 'do something',
|
||||
files: {
|
||||
@@ -190,6 +200,8 @@ describe('evalTest reliability logic', () => {
|
||||
try {
|
||||
await expect(
|
||||
internalEvalTest({
|
||||
suiteName: 'test',
|
||||
suiteType: 'behavioral',
|
||||
name: 'test-traversal',
|
||||
prompt: 'do something',
|
||||
files: {
|
||||
|
||||
+94
-54
@@ -16,10 +16,19 @@ import {
|
||||
Storage,
|
||||
getProjectHash,
|
||||
SESSION_FILE_PREFIX,
|
||||
PREVIEW_GEMINI_FLASH_MODEL,
|
||||
getErrorMessage,
|
||||
} from '@google/gemini-cli-core';
|
||||
|
||||
export * from '@google/gemini-cli-test-utils';
|
||||
|
||||
/**
|
||||
* The default model used for all evaluations.
|
||||
* Can be overridden by setting the GEMINI_MODEL environment variable.
|
||||
*/
|
||||
export const EVAL_MODEL =
|
||||
process.env['GEMINI_MODEL'] || PREVIEW_GEMINI_FLASH_MODEL;
|
||||
|
||||
// Indicates the consistency expectation for this test.
|
||||
// - ALWAYS_PASSES - Means that the test is expected to pass 100% of the time.
|
||||
// These tests are typically trivial and test basic functionality with unambiguous
|
||||
@@ -39,19 +48,49 @@ export * from '@google/gemini-cli-test-utils';
|
||||
export type EvalPolicy = 'ALWAYS_PASSES' | 'USUALLY_PASSES';
|
||||
|
||||
export function evalTest(policy: EvalPolicy, evalCase: EvalCase) {
|
||||
runEval(
|
||||
policy,
|
||||
evalCase.name,
|
||||
() => internalEvalTest(evalCase),
|
||||
evalCase.timeout,
|
||||
);
|
||||
runEval(policy, evalCase, () => internalEvalTest(evalCase));
|
||||
}
|
||||
|
||||
export async function internalEvalTest(evalCase: EvalCase) {
|
||||
export async function withEvalRetries(
|
||||
name: string,
|
||||
attemptFn: (attempt: number) => Promise<void>,
|
||||
) {
|
||||
const maxRetries = 3;
|
||||
let attempt = 0;
|
||||
|
||||
while (attempt <= maxRetries) {
|
||||
try {
|
||||
await attemptFn(attempt);
|
||||
return; // Success! Exit the retry loop.
|
||||
} catch (error: unknown) {
|
||||
const errorMessage = getErrorMessage(error);
|
||||
const errorCode = getApiErrorCode(errorMessage);
|
||||
|
||||
if (errorCode) {
|
||||
const status = attempt < maxRetries ? 'RETRY' : 'SKIP';
|
||||
logReliabilityEvent(name, attempt, status, errorCode, errorMessage);
|
||||
|
||||
if (attempt < maxRetries) {
|
||||
attempt++;
|
||||
console.warn(
|
||||
`[Eval] Attempt ${attempt} failed with ${errorCode} Error. Retrying...`,
|
||||
);
|
||||
continue; // Retry
|
||||
}
|
||||
|
||||
console.warn(
|
||||
`[Eval] '${name}' failed after ${maxRetries} retries due to persistent API errors. Skipping failure to avoid blocking PR.`,
|
||||
);
|
||||
return; // Gracefully exit without failing the test
|
||||
}
|
||||
|
||||
throw error; // Real failure
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
export async function internalEvalTest(evalCase: EvalCase) {
|
||||
await withEvalRetries(evalCase.name, async () => {
|
||||
const rig = new TestRig();
|
||||
const { logDir, sanitizedName } = await prepareLogDir(evalCase.name);
|
||||
const activityLogFile = path.join(logDir, `${sanitizedName}.jsonl`);
|
||||
@@ -59,14 +98,21 @@ export async function internalEvalTest(evalCase: EvalCase) {
|
||||
let isSuccess = false;
|
||||
|
||||
try {
|
||||
rig.setup(evalCase.name, evalCase.params);
|
||||
const setupOptions = {
|
||||
...evalCase.params,
|
||||
settings: {
|
||||
model: { name: EVAL_MODEL },
|
||||
...evalCase.params?.settings,
|
||||
},
|
||||
};
|
||||
rig.setup(evalCase.name, setupOptions);
|
||||
|
||||
if (evalCase.setup) {
|
||||
await evalCase.setup(rig);
|
||||
}
|
||||
|
||||
if (evalCase.files) {
|
||||
await setupTestFiles(rig, evalCase.files);
|
||||
await prepareWorkspace(rig.testDir!, rig.homeDir!, evalCase.files);
|
||||
}
|
||||
|
||||
symlinkNodeModules(rig.testDir || '');
|
||||
@@ -139,37 +185,6 @@ export async function internalEvalTest(evalCase: EvalCase) {
|
||||
|
||||
await evalCase.assert(rig, result);
|
||||
isSuccess = true;
|
||||
return; // Success! Exit the retry loop.
|
||||
} catch (error: unknown) {
|
||||
const errorMessage =
|
||||
error instanceof Error ? error.message : String(error);
|
||||
const errorCode = getApiErrorCode(errorMessage);
|
||||
|
||||
if (errorCode) {
|
||||
const status = attempt < maxRetries ? 'RETRY' : 'SKIP';
|
||||
logReliabilityEvent(
|
||||
evalCase.name,
|
||||
attempt,
|
||||
status,
|
||||
errorCode,
|
||||
errorMessage,
|
||||
);
|
||||
|
||||
if (attempt < maxRetries) {
|
||||
attempt++;
|
||||
console.warn(
|
||||
`[Eval] Attempt ${attempt} failed with ${errorCode} Error. Retrying...`,
|
||||
);
|
||||
continue; // Retry
|
||||
}
|
||||
|
||||
console.warn(
|
||||
`[Eval] '${evalCase.name}' failed after ${maxRetries} retries due to persistent API errors. Skipping failure to avoid blocking PR.`,
|
||||
);
|
||||
return; // Gracefully exit without failing the test
|
||||
}
|
||||
|
||||
throw error; // Real failure
|
||||
} finally {
|
||||
if (isSuccess) {
|
||||
await fs.promises.unlink(activityLogFile).catch((err) => {
|
||||
@@ -188,7 +203,7 @@ export async function internalEvalTest(evalCase: EvalCase) {
|
||||
);
|
||||
await rig.cleanup();
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
function getApiErrorCode(message: string): '500' | '503' | undefined {
|
||||
@@ -226,7 +241,7 @@ function logReliabilityEvent(
|
||||
const reliabilityLog = {
|
||||
timestamp: new Date().toISOString(),
|
||||
testName,
|
||||
model: process.env.GEMINI_MODEL || 'unknown',
|
||||
model: process.env['GEMINI_MODEL'] || 'unknown',
|
||||
attempt,
|
||||
status,
|
||||
errorCode,
|
||||
@@ -252,9 +267,13 @@ function logReliabilityEvent(
|
||||
* intentionally uses synchronous filesystem and child_process operations
|
||||
* for simplicity and to ensure sequential environment preparation.
|
||||
*/
|
||||
async function setupTestFiles(rig: TestRig, files: Record<string, string>) {
|
||||
export async function prepareWorkspace(
|
||||
testDir: string,
|
||||
homeDir: string,
|
||||
files: Record<string, string>,
|
||||
) {
|
||||
const acknowledgedAgents: Record<string, Record<string, string>> = {};
|
||||
const projectRoot = fs.realpathSync(rig.testDir!);
|
||||
const projectRoot = fs.realpathSync(testDir);
|
||||
|
||||
for (const [filePath, content] of Object.entries(files)) {
|
||||
if (filePath.includes('..') || path.isAbsolute(filePath)) {
|
||||
@@ -290,7 +309,7 @@ async function setupTestFiles(rig: TestRig, files: Record<string, string>) {
|
||||
|
||||
if (Object.keys(acknowledgedAgents).length > 0) {
|
||||
const ackPath = path.join(
|
||||
rig.homeDir!,
|
||||
homeDir,
|
||||
'.gemini',
|
||||
'acknowledgments',
|
||||
'agents.json',
|
||||
@@ -299,7 +318,7 @@ async function setupTestFiles(rig: TestRig, files: Record<string, string>) {
|
||||
fs.writeFileSync(ackPath, JSON.stringify(acknowledgedAgents, null, 2));
|
||||
}
|
||||
|
||||
const execOptions = { cwd: rig.testDir!, stdio: 'inherit' as const };
|
||||
const execOptions = { cwd: testDir, stdio: 'ignore' as const };
|
||||
execSync('git init --initial-branch=main', execOptions);
|
||||
execSync('git config user.email "test@example.com"', execOptions);
|
||||
execSync('git config user.name "Test User"', execOptions);
|
||||
@@ -320,14 +339,30 @@ async function setupTestFiles(rig: TestRig, files: Record<string, string>) {
|
||||
*/
|
||||
export function runEval(
|
||||
policy: EvalPolicy,
|
||||
name: string,
|
||||
evalCase: BaseEvalCase,
|
||||
fn: () => Promise<void>,
|
||||
timeout?: number,
|
||||
timeoutOverride?: number,
|
||||
) {
|
||||
if (policy === 'USUALLY_PASSES' && !process.env['RUN_EVALS']) {
|
||||
it.skip(name, fn);
|
||||
const { name, timeout, suiteName, suiteType } = evalCase;
|
||||
const targetSuiteType = process.env['EVAL_SUITE_TYPE'];
|
||||
const targetSuiteName = process.env['EVAL_SUITE_NAME'];
|
||||
|
||||
const meta = { suiteType, suiteName };
|
||||
|
||||
const skipBySuiteType =
|
||||
targetSuiteType && suiteType && suiteType !== targetSuiteType;
|
||||
const skipBySuiteName =
|
||||
targetSuiteName && suiteName && suiteName !== targetSuiteName;
|
||||
|
||||
const options = { timeout: timeoutOverride ?? timeout, meta };
|
||||
if (
|
||||
(policy === 'USUALLY_PASSES' && !process.env['RUN_EVALS']) ||
|
||||
skipBySuiteType ||
|
||||
skipBySuiteName
|
||||
) {
|
||||
it.skip(name, options, fn);
|
||||
} else {
|
||||
it(name, fn, timeout);
|
||||
it(name, options, fn);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -366,15 +401,20 @@ interface ForbiddenToolSettings {
|
||||
};
|
||||
}
|
||||
|
||||
export interface EvalCase {
|
||||
export interface BaseEvalCase {
|
||||
suiteName: string;
|
||||
suiteType: 'behavioral' | 'component-level' | 'hero-scenario';
|
||||
name: string;
|
||||
timeout?: number;
|
||||
files?: Record<string, string>;
|
||||
}
|
||||
|
||||
export interface EvalCase extends BaseEvalCase {
|
||||
params?: {
|
||||
settings?: ForbiddenToolSettings & Record<string, unknown>;
|
||||
[key: string]: unknown;
|
||||
};
|
||||
prompt: string;
|
||||
timeout?: number;
|
||||
files?: Record<string, string>;
|
||||
setup?: (rig: TestRig) => Promise<void> | void;
|
||||
/** Conversation history to pre-load via --resume. Each entry is a message object with type, content, etc. */
|
||||
messages?: Record<string, unknown>[];
|
||||
|
||||
@@ -31,6 +31,8 @@ describe('Tool Output Masking Behavioral Evals', () => {
|
||||
* It should recognize the <tool_output_masked> tag and use a tool to read the file.
|
||||
*/
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should attempt to read the redirected full output file when information is masked',
|
||||
params: {
|
||||
security: {
|
||||
@@ -167,6 +169,8 @@ Output too large. Full output available at: ${outputFilePath}
|
||||
* Scenario: Information is in the preview.
|
||||
*/
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should NOT read the full output file when the information is already in the preview',
|
||||
params: {
|
||||
security: {
|
||||
|
||||
@@ -25,6 +25,8 @@ const FILES = {
|
||||
|
||||
describe('tracker_mode', () => {
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should manage tasks in the tracker when explicitly requested during a bug fix',
|
||||
params: {
|
||||
settings: { experimental: { taskTracker: true } },
|
||||
@@ -78,6 +80,8 @@ describe('tracker_mode', () => {
|
||||
});
|
||||
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should implicitly create tasks when asked to build a feature plan',
|
||||
params: {
|
||||
settings: { experimental: { taskTracker: true } },
|
||||
|
||||
@@ -9,6 +9,8 @@ import { evalTest } from './test-helper.js';
|
||||
|
||||
describe('validation_fidelity', () => {
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should perform exhaustive validation autonomously when guided by system instructions',
|
||||
files: {
|
||||
'src/types.ts': `
|
||||
|
||||
@@ -9,6 +9,8 @@ import { evalTest } from './test-helper.js';
|
||||
|
||||
describe('validation_fidelity_pre_existing_errors', () => {
|
||||
evalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'should handle pre-existing project errors gracefully during validation',
|
||||
files: {
|
||||
'src/math.ts': `
|
||||
|
||||
@@ -24,7 +24,10 @@ export default defineConfig({
|
||||
environment: 'node',
|
||||
globals: true,
|
||||
alias: {
|
||||
react: path.resolve(__dirname, '../node_modules/react'),
|
||||
'@google/gemini-cli-core': path.resolve(
|
||||
__dirname,
|
||||
'../packages/core/index.ts',
|
||||
),
|
||||
},
|
||||
setupFiles: [path.resolve(__dirname, '../packages/cli/test-setup.ts')],
|
||||
server: {
|
||||
|
||||
Reference in New Issue
Block a user