From ac2ee4261fa6685ef6e75d8a93ac0b1c87e38f37 Mon Sep 17 00:00:00 2001 From: Christian Gunderman Date: Thu, 19 Feb 2026 19:02:14 -0800 Subject: [PATCH] Implement an eval creator skill. --- .gemini/skills/eval-creator/SKILL.md | 78 ++++++++++++++++++++++++++++ evals/README.md | 42 ++++++++++++++- evals/test-helper.ts | 52 ++++++++++++++++++- 3 files changed, 170 insertions(+), 2 deletions(-) create mode 100644 .gemini/skills/eval-creator/SKILL.md diff --git a/.gemini/skills/eval-creator/SKILL.md b/.gemini/skills/eval-creator/SKILL.md new file mode 100644 index 0000000000..7b43bf077d --- /dev/null +++ b/.gemini/skills/eval-creator/SKILL.md @@ -0,0 +1,78 @@ +--- +name: eval-creator +description: Converts exported `chat.json` conversation files into minimal, anonymized, and reproducible behavioral evaluation tests. +--- + +# eval-creator + +## Description +Converts exported `chat.json` conversation files into minimal, anonymized, and reproducible behavioral evaluation tests (`.eval.ts`) for the Gemini CLI `evalTest` framework. + +## Instructions + +When the user asks you to create an eval from a `chat.json` file, follow these steps strictly: + +1. **Read and Identify Bugs:** + * Use `read_file` to examine the contents of the provided `chat.json` file. + * **CRITICAL:** Before deeply analyzing the file, ask the user what type of issue they are targeting: + 1. **Agent Misbehavior:** A bug with the agent itself (e.g., tool misuse, failing to follow instructions, hallucinating tool names/parameters, `replace` tool mismatch errors). + 2. **Solution Issue:** A problem with the code or strategy the agent produced (e.g., the code didn't compile, lint errors, missed architectural constraints, bad strategy). + * Once the user clarifies the category, analyze the conversation history (the `Content[]` array) to identify *only* those failures, bugs, or regressions that match the specified category. 
+ * Present a concise summary of the matching potential bugs to the user and **ask them which specific issues they want you to create evaluations for.** Do NOT proceed to create files until the user confirms. + +2. **Minimize and Anonymize:** + * Identify the "Repro Turn": the specific user prompt where the agent failed. + * Discard all preceding conversation turns that are not strictly necessary to set up the environment or trigger the behavior. + * For the remaining preceding turns, anonymize and keep them in a `messages` array. + * **CRITICAL:** Anonymize all data. Replace absolute file paths (e.g., `/Users/username/code/...`) with generic relative paths (e.g., `src/app.ts`). Remove any sensitive tokens, API keys, or personal information. Replace user-specific code with generic, simplified code snippets that still reproduce the issue. + +3. **Reconstruct Initial State:** + * Determine the minimal set of files and their contents required to exist *before* the target prompt is issued. This will become the `files` object in the test. + +4. **Generate Test Code:** + * Create a valid TypeScript file using the `evalTest` framework from `evals/test-helper.ts`. + * Use the following template: + +```typescript +import { describe, expect } from 'vitest'; +import { evalTest } from './test-helper.js'; + +describe('chat-to-eval-generated', () => { + evalTest('USUALLY_PASSES', { + name: 'should <describe the expected behavior>', + prompt: '<the repro prompt>', + // Use messages to replay a realistic history leading up to the repro prompt. + // This makes the test faster and avoids re-executing established context. + messages: [ + // { id: '...', type: 'user', content: [{ text: '...' }], timestamp: '...' }, + // { id: '...', type: 'gemini', content: [{ text: '...' }], timestamp: '...' }, + ], + files: { + 'path/to/mock/file.ts': '...', + }, + assert: async (rig, result) => { + // Add assertions based on expected model behavior. 
+ // Example: + // const logs = rig.readToolLogs(); + // expect(logs.some(l => l.toolRequest.name === 'write_file')).toBe(true); + // expect(rig.readFile('path/to/mock/file.ts')).toContain('...'); + }, + }); +}); +``` + +5. **Validate and Finalize:** + * **CRITICAL:** An evaluation is only valid to add if it is **initially failing** (demonstrating the bug or missing behavior). Skip this requirement only if you have clear evidence that the user has already applied a fix and is using the test for regression verification. + * Inform the user that the file has been created. + * Include instructions on how to run the test to verify failure/success: + ```bash + # Run only the new eval test + RUN_EVALS=1 npx vitest run --config evals/vitest.config.ts evals/<name>.eval.ts + + # Run all evals (including 'USUALLY_PASSES') + npm run test:all_evals + + # Run only 'ALWAYS_PASSES' evals + npm run test:always_passing_evals + ``` + * Remind them to manually review the assertions and ensure complete anonymization before committing. diff --git a/evals/README.md b/evals/README.md index eb3cf2be70..cb6b6f3430 100644 --- a/evals/README.md +++ b/evals/README.md @@ -32,6 +32,36 @@ CLI's features. ## Creating an Evaluation +To easily create a new evaluation, use the `eval-creator` skill with an exported +conversation. + +### Using the `eval-creator` skill + +If you have a `chat.json` file exported from a session where the model exhibited +incorrect behavior, use Gemini CLI to automatically generate a minimal, +anonymized evaluation: + +1. **Export the session:** Export the problematic conversation to a `chat.json` + file. +2. **Activate the skill:** Run the CLI and ask it to create an eval, activating + the skill: + + ```bash + gemini "Create an eval from my-repro-chat.json" --skill eval-creator + ``` + + Alternatively, start an interactive session, activate the skill with + `/skill eval-creator`, and provide the file. + +3. 
**Follow the prompts:** Answer the CLI prompts to clarify the type of issue + (Agent Misbehavior vs. Solution Issue) and confirm which specific + interactions to turn into a test. +4. **Review the generated test:** Manually review the `.eval.ts` file generated + in the `evals/` directory. Check the generated assertions and ensure the + test is fully anonymized. + +### Manual creation + Evaluations are located in the `evals` directory. Each evaluation is a Vitest test file that uses the `evalTest` function from `evals/test-helper.ts`. @@ -58,7 +88,17 @@ behaviors. flakiness due to non-deterministic behaviors. These are run nightly and used to track the health of the product from build to build. -#### `EvalCase` Properties +### Best Practices + +- **Initially Failing**: An evaluation is only valid to add if it is **initially + failing** (demonstrating the bug or missing behavior). Skip this requirement + only if you have clear evidence that the bug has already been fixed and the + test is for regression verification. +- **Minimization**: Keep the `files` and `prompt` as minimal as possible to + reproduce the behavior. +- **Anonymization**: Always anonymize code, paths, and identifiers. + +### `EvalCase` Properties - `name`: The name of the evaluation case. - `prompt`: The prompt to send to the model. diff --git a/evals/test-helper.ts b/evals/test-helper.ts index 44c538c197..5ea5163810 100644 --- a/evals/test-helper.ts +++ b/evals/test-helper.ts @@ -13,6 +13,9 @@ import { TestRig } from '@google/gemini-cli-test-utils'; import { createUnauthorizedToolError, parseAgentMarkdown, + Storage, + getProjectHash, + SESSION_FILE_PREFIX, } from '@google/gemini-cli-core'; export * from '@google/gemini-cli-test-utils'; @@ -35,6 +38,18 @@ export * from '@google/gemini-cli-test-utils'; // This may take a really long time and is not recommended. 
export type EvalPolicy = 'ALWAYS_PASSES' | 'USUALLY_PASSES'; +export interface EvalCase { + name: string; + params?: Record<string, any>; + prompt: string; + timeout?: number; + files?: Record<string, string>; + messages?: any[]; + sessionId?: string; + approvalMode?: 'default' | 'auto_edit' | 'yolo' | 'plan'; + assert: (rig: TestRig, result: string) => Promise<void>; +} + export function evalTest(policy: EvalPolicy, evalCase: EvalCase) { const fn = async () => { const rig = new TestRig(); @@ -116,8 +131,43 @@ export function evalTest(policy: EvalPolicy, evalCase: EvalCase) { execSync('git commit --allow-empty -m "Initial commit"', execOptions); } + let sessionId: string | undefined; + if (evalCase.messages) { + sessionId = + evalCase.sessionId || + `test-session-${crypto.randomUUID().slice(0, 8)}`; + const storage = new Storage(fs.realpathSync(rig.testDir!)); + + // We need to set the GEMINI_CLI_HOME env var so Storage.getGlobalGeminiDir() points to our fake home + const originalGeminiHome = process.env['GEMINI_CLI_HOME']; + try { + process.env['GEMINI_CLI_HOME'] = rig.homeDir!; + await storage.initialize(); + const chatsDir = path.join(storage.getProjectTempDir(), 'chats'); + fs.mkdirSync(chatsDir, { recursive: true }); + + const conversation = { + sessionId, + projectHash: getProjectHash(fs.realpathSync(rig.testDir!)), + startTime: new Date().toISOString(), + lastUpdated: new Date().toISOString(), + messages: evalCase.messages, + }; + + const filename = `${SESSION_FILE_PREFIX}${new Date().toISOString().slice(0, 10)}-${sessionId.slice(0, 8)}.json`; + fs.writeFileSync( + path.join(chatsDir, filename), + JSON.stringify(conversation, null, 2), + ); + } finally { + process.env['GEMINI_CLI_HOME'] = originalGeminiHome; + } + } + const result = await rig.run({ - args: evalCase.prompt, + args: sessionId + ? ['--resume', sessionId, evalCase.prompt] + : evalCase.prompt, approvalMode: evalCase.approvalMode ?? 'yolo', timeout: evalCase.timeout, env: {