Implement an eval creator skill.

This commit is contained in:
Christian Gunderman
2026-02-19 19:02:14 -08:00
parent cbfb2a4e26
commit ac2ee4261f
3 changed files with 170 additions and 2 deletions
+41 -1
View File
@@ -32,6 +32,36 @@ CLI's features.
## Creating an Evaluation
To easily create a new evaluation, use the `eval-creator` skill with an exported
conversation.
### Using the `eval-creator` skill
If you have a `chat.json` file exported from a session where the model exhibited
incorrect behavior, use Gemini CLI to automatically generate a minimal,
anonymized evaluation:
1. **Export the session:** Export the problematic conversation to a `chat.json`
file.
2. **Activate the skill:** Run the CLI and ask it to create an eval, activating
the skill:
```bash
gemini "Create an eval from my-repro-chat.json" --skill eval-creator
```
Alternatively, start an interactive session, activate the skill with
`/skill eval-creator`, and provide the file.
3. **Follow the prompts:** Answer the CLI prompts to clarify the type of issue
(Agent Misbehavior vs. Solution Issue) and confirm which specific
interactions to turn into a test.
4. **Review the generated test:** Manually review the `.eval.ts` file generated
in the `evals/` directory. Check the generated assertions and ensure the
test is fully anonymized.
### Manual creation
Evaluations are located in the `evals` directory. Each evaluation is a Vitest
test file that uses the `evalTest` function from `evals/test-helper.ts`.
@@ -58,7 +88,17 @@ behaviors.
flakiness due to non-deterministic behaviors. These are run nightly and used
to track the health of the product from build to build.
#### `EvalCase` Properties
### Best Practices
- **Initially Failing**: Only add an evaluation that is **initially failing**
(demonstrating the bug or missing behavior). Skip this requirement
only if you have clear evidence that the bug has already been fixed and the
evaluation serves as a regression test.
- **Minimization**: Keep the `files` and `prompt` as minimal as possible to
reproduce the behavior.
- **Anonymization**: Always anonymize code, paths, and identifiers.
### `EvalCase` Properties
- `name`: The name of the evaluation case.
- `prompt`: The prompt to send to the model.
+51 -1
View File
@@ -13,6 +13,9 @@ import { TestRig } from '@google/gemini-cli-test-utils';
import {
createUnauthorizedToolError,
parseAgentMarkdown,
Storage,
getProjectHash,
SESSION_FILE_PREFIX,
} from '@google/gemini-cli-core';
export * from '@google/gemini-cli-test-utils';
@@ -35,6 +38,18 @@ export * from '@google/gemini-cli-test-utils';
// This may take a really long time and is not recommended.
export type EvalPolicy = 'ALWAYS_PASSES' | 'USUALLY_PASSES';
/**
 * Describes a single evaluation passed to `evalTest`.
 */
export interface EvalCase {
/** The name of the evaluation case. */
name: string;
/** NOTE(review): consumer is not visible in this chunk; presumably extra run parameters — confirm. */
params?: Record<string, any>;
/** The prompt to send to the model. */
prompt: string;
/** Forwarded as the `timeout` option of `rig.run`. */
timeout?: number;
/** NOTE(review): presumably maps file paths to contents created for the test — confirm against `evalTest`. */
files?: Record<string, string>;
/**
 * Prior conversation messages. When set, `evalTest` writes them into a
 * session file and runs the CLI with `--resume` for that session.
 */
messages?: any[];
/**
 * Session ID used when `messages` is set. When omitted (or empty), `evalTest`
 * generates a `test-session-` ID with a random 8-character suffix.
 */
sessionId?: string;
/** Approval mode passed to `rig.run`; `evalTest` falls back to `'yolo'` when omitted. */
approvalMode?: 'default' | 'auto_edit' | 'yolo' | 'plan';
/**
 * Assertion callback that receives the test rig and a result string.
 * NOTE(review): presumably called with the output of `rig.run` — call site not visible here.
 */
assert: (rig: TestRig, result: string) => Promise<void>;
}
export function evalTest(policy: EvalPolicy, evalCase: EvalCase) {
const fn = async () => {
const rig = new TestRig();
@@ -116,8 +131,43 @@ export function evalTest(policy: EvalPolicy, evalCase: EvalCase) {
execSync('git commit --allow-empty -m "Initial commit"', execOptions);
}
// Optional session seeding: when the eval case supplies prior `messages`,
// write them to a session file under the project's temp `chats` directory so
// the CLI can be run with `--resume <sessionId>`.
let sessionId: string | undefined;
if (evalCase.messages) {
  sessionId =
    evalCase.sessionId ||
    `test-session-${crypto.randomUUID().slice(0, 8)}`;

  const storage = new Storage(fs.realpathSync(rig.testDir!));
  // We need to set the GEMINI_CLI_HOME env var so Storage.getGlobalGeminiDir() points to our fake home
  const originalGeminiHome = process.env['GEMINI_CLI_HOME'];
  try {
    process.env['GEMINI_CLI_HOME'] = rig.homeDir!;
    await storage.initialize();
    const chatsDir = path.join(storage.getProjectTempDir(), 'chats');
    fs.mkdirSync(chatsDir, { recursive: true });

    const conversation = {
      sessionId,
      projectHash: getProjectHash(fs.realpathSync(rig.testDir!)),
      startTime: new Date().toISOString(),
      lastUpdated: new Date().toISOString(),
      messages: evalCase.messages,
    };

    // File name: session prefix + current date (YYYY-MM-DD) + first 8 chars of the session ID.
    const filename = `${SESSION_FILE_PREFIX}${new Date().toISOString().slice(0, 10)}-${sessionId.slice(0, 8)}.json`;
    fs.writeFileSync(
      path.join(chatsDir, filename),
      JSON.stringify(conversation, null, 2),
    );
  } finally {
    // Restore the caller's environment. Assigning `undefined` to a
    // `process.env` key stores the literal string "undefined", so the key
    // must be deleted when it was not set before.
    if (originalGeminiHome === undefined) {
      delete process.env['GEMINI_CLI_HOME'];
    } else {
      process.env['GEMINI_CLI_HOME'] = originalGeminiHome;
    }
  }
}
const result = await rig.run({
args: evalCase.prompt,
args: sessionId
? ['--resume', sessionId, evalCase.prompt]
: evalCase.prompt,
approvalMode: evalCase.approvalMode ?? 'yolo',
timeout: evalCase.timeout,
env: {