mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-05-13 13:22:35 -07:00
Merge remote-tracking branch 'origin/main' into rename-directory-to-workspace
# Conflicts: # docs/cli/settings.md # docs/reference/configuration.md # packages/cli/src/config/settings.test.ts # packages/cli/src/config/settingsSchema.ts # packages/cli/src/services/CommandService.test.ts # packages/cli/src/services/FileCommandLoader.ts # packages/cli/src/ui/components/RewindViewer.test.tsx # packages/cli/src/ui/constants/tips.ts # packages/cli/src/ui/utils/borderStyles.test.tsx # packages/cli/src/utils/processUtils.test.ts # packages/cli/src/utils/sessionUtils.ts # packages/core/src/commands/memory.test.ts # packages/core/src/config/config.ts # packages/core/src/config/storage.test.ts # packages/core/src/core/__snapshots__/prompts.test.ts.snap # packages/core/src/scheduler/tool-executor.ts # packages/core/src/services/contextManager.ts # packages/core/src/utils/memoryDiscovery.test.ts # packages/core/src/utils/memoryDiscovery.ts # schemas/settings.schema.json
This commit is contained in:
+44
-1
@@ -3,7 +3,8 @@
|
||||
Behavioral evaluations (evals) are tests designed to validate the agent's
|
||||
behavior in response to specific prompts. They serve as a critical feedback loop
|
||||
for changes to system prompts, tool definitions, and other model-steering
|
||||
mechanisms.
|
||||
mechanisms, and as a tool for assessing feature reliability by model and
|
||||
preventing regressions.
|
||||
|
||||
## Why Behavioral Evals?
|
||||
|
||||
@@ -30,6 +31,48 @@ CLI's features.
|
||||
those that are generally reliable but might occasionally vary
|
||||
(`USUALLY_PASSES`).
|
||||
|
||||
## Best Practices
|
||||
|
||||
When designing behavioral evals, aim for scenarios that accurately reflect
|
||||
real-world usage while remaining small and maintainable.
|
||||
|
||||
- **Realistic Complexity**: Evals should be complicated enough to be
|
||||
"realistic." They should operate on actual files and a source directory,
|
||||
mirroring how a real agent interacts with a workspace. Remember that the agent
|
||||
may behave differently in a larger codebase, so we want to avoid scenarios
|
||||
that are too simple to be realistic.
|
||||
- _Good_: An eval that provides a small, functional React component and asks
|
||||
the agent to add a specific feature, requiring it to read the file,
|
||||
understand the context, and write the correct changes.
|
||||
- _Bad_: An eval that simply asks the agent a trivia question or asks it to
|
||||
write a generic script without providing any local workspace context.
|
||||
- **Maintainable Size**: Evals should be small enough to reason about and
|
||||
maintain. We probably can't check in an entire repo as a test case, though
|
||||
over time we will want these evals to mature into more and more realistic
|
||||
scenarios.
|
||||
- _Good_: A test setup with 2-3 files (e.g., a source file, a config file, and
|
||||
a test file) that isolates the specific behavior being evaluated.
|
||||
- _Bad_: A test setup containing dozens of files from a complex framework
|
||||
where the setup logic itself is prone to breaking.
|
||||
- **Unambiguous and Reliable Assertions**: Assertions must be clear and specific
|
||||
to ensure the test passes for the right reason.
|
||||
- _Good_: Checking that a modified file contains a specific AST node or exact
|
||||
string, or verifying that a tool was called with the right parameters.
|
||||
- _Bad_: Only checking for a tool call, which could happen for an unrelated
|
||||
reason, or expecting specific LLM output.
|
||||
- **Fail First**: Have tests that failed before your prompt or tool change. We
|
||||
want to be sure the test fails before your "fix". It's pretty easy to
|
||||
accidentally create a passing test that asserts behaviors we get for free. In
|
||||
general, every eval should be accompanied by a prompt change, and most prompt
|
||||
changes should be accompanied by an eval.
|
||||
- _Good_: Observing a failure, writing an eval that reliably reproduces the
|
||||
failure, modifying the prompt/tool, and then verifying the eval passes.
|
||||
- _Bad_: Writing an eval that passes on the first run and assuming your new
|
||||
prompt change was responsible.
|
||||
- **Less is More**: Prefer fewer, more realistic tests that assert the major
|
||||
paths vs. more tests that are more unit-test like. These are evals, so the
|
||||
value is in testing how the agent works in a semi-realistic scenario.
|
||||
|
||||
## Creating an Evaluation
|
||||
|
||||
Evaluations are located in the `evals` directory. Each evaluation is a Vitest
|
||||
|
||||
@@ -0,0 +1,92 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import { describe, expect } from 'vitest';
|
||||
import { evalTest } from './test-helper.js';
|
||||
|
||||
describe('ask_user', () => {
|
||||
evalTest('USUALLY_PASSES', {
|
||||
name: 'Agent uses AskUser tool to present multiple choice options',
|
||||
prompt: `Use the ask_user tool to ask me what my favorite color is. Provide 3 options: red, green, or blue.`,
|
||||
assert: async (rig) => {
|
||||
const wasToolCalled = await rig.waitForToolCall('ask_user');
|
||||
expect(wasToolCalled, 'Expected ask_user tool to be called').toBe(true);
|
||||
},
|
||||
});
|
||||
|
||||
evalTest('USUALLY_PASSES', {
|
||||
name: 'Agent uses AskUser tool to clarify ambiguous requirements',
|
||||
files: {
|
||||
'package.json': JSON.stringify({ name: 'my-app', version: '1.0.0' }),
|
||||
},
|
||||
prompt: `I want to build a new feature in this app. Ask me questions to clarify the requirements before proceeding.`,
|
||||
assert: async (rig) => {
|
||||
const wasToolCalled = await rig.waitForToolCall('ask_user');
|
||||
expect(wasToolCalled, 'Expected ask_user tool to be called').toBe(true);
|
||||
},
|
||||
});
|
||||
|
||||
evalTest('USUALLY_PASSES', {
|
||||
name: 'Agent uses AskUser tool before performing significant ambiguous rework',
|
||||
files: {
|
||||
'packages/core/src/index.ts': '// index\nexport const version = "1.0.0";',
|
||||
'packages/core/src/util.ts': '// util\nexport function help() {}',
|
||||
'packages/core/package.json': JSON.stringify({
|
||||
name: '@google/gemini-cli-core',
|
||||
}),
|
||||
'README.md': '# Gemini CLI',
|
||||
},
|
||||
prompt: `Refactor the entire core package to be better.`,
|
||||
assert: async (rig) => {
|
||||
const wasPlanModeCalled = await rig.waitForToolCall('enter_plan_mode');
|
||||
expect(wasPlanModeCalled, 'Expected enter_plan_mode to be called').toBe(
|
||||
true,
|
||||
);
|
||||
|
||||
const wasAskUserCalled = await rig.waitForToolCall('ask_user');
|
||||
expect(
|
||||
wasAskUserCalled,
|
||||
'Expected ask_user tool to be called to clarify the significant rework',
|
||||
).toBe(true);
|
||||
},
|
||||
});
|
||||
|
||||
// --- Regression Tests for Recent Fixes ---
|
||||
|
||||
// Regression test for issue #20177: Ensure the agent does not use `ask_user` to
|
||||
// confirm shell commands. Fixed via prompt refinements and tool definition
|
||||
// updates to clarify that shell command confirmation is handled by the UI.
|
||||
// See fix: https://github.com/google-gemini/gemini-cli/pull/20504
|
||||
evalTest('USUALLY_PASSES', {
|
||||
name: 'Agent does NOT use AskUser to confirm shell commands',
|
||||
files: {
|
||||
'package.json': JSON.stringify({
|
||||
scripts: { build: 'echo building' },
|
||||
}),
|
||||
},
|
||||
prompt: `Run 'npm run build' in the current directory.`,
|
||||
assert: async (rig) => {
|
||||
await rig.waitForTelemetryReady();
|
||||
|
||||
const toolLogs = rig.readToolLogs();
|
||||
const wasShellCalled = toolLogs.some(
|
||||
(log) => log.toolRequest.name === 'run_shell_command',
|
||||
);
|
||||
const wasAskUserCalled = toolLogs.some(
|
||||
(log) => log.toolRequest.name === 'ask_user',
|
||||
);
|
||||
|
||||
expect(
|
||||
wasShellCalled,
|
||||
'Expected run_shell_command tool to be called',
|
||||
).toBe(true);
|
||||
expect(
|
||||
wasAskUserCalled,
|
||||
'ask_user should not be called to confirm shell commands',
|
||||
).toBe(false);
|
||||
},
|
||||
});
|
||||
});
|
||||
@@ -0,0 +1,56 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import { expect } from 'vitest';
|
||||
import { evalTest } from './test-helper.js';
|
||||
|
||||
const MUTATION_AGENT_DEFINITION = `---
|
||||
name: mutation-agent
|
||||
description: An agent that modifies the workspace (writes, deletes, git operations, etc).
|
||||
max_turns: 1
|
||||
tools:
|
||||
- write_file
|
||||
---
|
||||
|
||||
You are the mutation agent. Do the mutation requested.
|
||||
`;
|
||||
|
||||
describe('concurrency safety eval test cases', () => {
|
||||
evalTest('USUALLY_PASSES', {
|
||||
name: 'mutation agents are run in parallel when explicitly requested',
|
||||
params: {
|
||||
settings: {
|
||||
experimental: {
|
||||
enableAgents: true,
|
||||
},
|
||||
},
|
||||
},
|
||||
prompt:
|
||||
'Update A.txt to say "A" and update B.txt to say "B". Delegate these tasks to two separate mutation-agent subagents. You MUST run these subagents in parallel at the same time.',
|
||||
files: {
|
||||
'.gemini/agents/mutation-agent.md': MUTATION_AGENT_DEFINITION,
|
||||
},
|
||||
assert: async (rig) => {
|
||||
const logs = rig.readToolLogs();
|
||||
const mutationCalls = logs.filter(
|
||||
(log) => log.toolRequest?.name === 'mutation-agent',
|
||||
);
|
||||
|
||||
expect(
|
||||
mutationCalls.length,
|
||||
'Agent should have called the mutation-agent at least twice',
|
||||
).toBeGreaterThanOrEqual(2);
|
||||
|
||||
const firstPromptId = mutationCalls[0].toolRequest.prompt_id;
|
||||
const secondPromptId = mutationCalls[1].toolRequest.prompt_id;
|
||||
|
||||
expect(
|
||||
firstPromptId,
|
||||
'mutation agents should be called in parallel (same turn / prompt_ids) when explicitly requested',
|
||||
).toEqual(secondPromptId);
|
||||
},
|
||||
});
|
||||
});
|
||||
@@ -112,6 +112,7 @@ export function evalTest(policy: EvalPolicy, evalCase: EvalCase) {
|
||||
// commands.
|
||||
execSync('git config core.editor "true"', execOptions);
|
||||
execSync('git config core.pager "cat"', execOptions);
|
||||
execSync('git config commit.gpgsign false', execOptions);
|
||||
execSync('git add .', execOptions);
|
||||
execSync('git commit --allow-empty -m "Initial commit"', execOptions);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user