mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-05-22 09:42:39 -07:00
Implement an eval creator skill.
This commit is contained in:
@@ -0,0 +1,78 @@
|
||||
---
|
||||
name: eval-creator
|
||||
description: Converts exported `chat.json` conversation files into minimal, anonymized, and reproducible behavioral evaluation tests.
|
||||
---
|
||||
|
||||
# eval-creator
|
||||
|
||||
## Description
|
||||
Converts exported `chat.json` conversation files into minimal, anonymized, and reproducible behavioral evaluation tests (`.eval.ts`) for the Gemini CLI `evalTest` framework.
|
||||
|
||||
## Instructions
|
||||
|
||||
When the user asks you to create an eval from a `chat.json` file, follow these steps strictly:
|
||||
|
||||
1. **Read and Identify Bugs:**
|
||||
* Use `read_file` to examine the contents of the provided `chat.json` file.
|
||||
* **CRITICAL:** Before deeply analyzing the file, ask the user what type of issue they are targeting:
|
||||
1. **Agent Misbehavior:** A bug with the agent itself (e.g., tool misuse, failing to follow instructions, hallucinating tool names/parameters, `replace` tool mismatch errors).
|
||||
2. **Solution Issue:** A problem with the code or strategy the agent produced (e.g., the code didn't compile, lint errors, missed architectural constraints, bad strategy).
|
||||
* Once the user clarifies the category, analyze the conversation history (the `Content[]` array) to identify *only* those failures, bugs, or regressions that match the specified category.
|
||||
* Present a concise summary of the matching potential bugs to the user and **ask them which specific issues they want you to create evaluations for.** Do NOT proceed to create files until the user confirms.
|
||||
|
||||
2. **Minimize and Anonymize:**
|
||||
* Identify the "Repro Turn": the specific user prompt where the agent failed.
|
||||
* Discard all preceding conversation turns that are not strictly necessary to set up the environment or trigger the behavior.
|
||||
* For the remaining preceding turns, anonymize and keep them in a `messages` array.
|
||||
* **CRITICAL:** Anonymize all data. Replace absolute file paths (e.g., `/Users/username/code/...`) with generic relative paths (e.g., `src/app.ts`). Remove any sensitive tokens, API keys, or personal information. Replace user-specific code with generic, simplified code snippets that still reproduce the issue.
|
||||
|
||||
3. **Reconstruct Initial State:**
|
||||
* Determine the minimal set of files and their contents required to exist *before* the target prompt is issued. This will become the `files` object in the test.
|
||||
|
||||
4. **Generate Test Code:**
|
||||
* Create a valid TypeScript file at `evals/<descriptive-name>.eval.ts` using the `evalTest` framework from `evals/test-helper.ts`.
|
||||
* Use the following template:
|
||||
|
||||
```typescript
|
||||
import { describe, expect } from 'vitest';
|
||||
import { evalTest } from './test-helper.js';
|
||||
|
||||
describe('chat-to-eval-generated', () => {
|
||||
evalTest('USUALLY_PASSES', {
|
||||
name: 'should <describe expected behavior>',
|
||||
prompt: '<the repro user prompt>',
|
||||
// Use messages to replay a realistic history leading up to the repro prompt.
|
||||
// This makes the test faster and avoids re-executing established context.
|
||||
messages: [
|
||||
// { id: '...', type: 'user', content: [{ text: '...' }], timestamp: '...' },
|
||||
// { id: '...', type: 'gemini', content: [{ text: '...' }], timestamp: '...' },
|
||||
],
|
||||
files: {
|
||||
'path/to/mock/file.ts': '...',
|
||||
},
|
||||
assert: async (rig, result) => {
|
||||
// Add assertions based on expected model behavior.
|
||||
// Example:
|
||||
// const logs = rig.readToolLogs();
|
||||
// expect(logs.some(l => l.toolRequest.name === 'write_file')).toBe(true);
|
||||
// expect(rig.readFile('path/to/mock/file.ts')).toContain('...');
|
||||
},
|
||||
});
|
||||
});
|
||||
```
|
||||
|
||||
5. **Validate and Finalize:**
|
||||
* **CRITICAL:** An evaluation is only valid to add if it is **initially failing** (demonstrating the bug or missing behavior). Skip this requirement only if you have clear evidence that the user has already applied a fix and is using the test for regression verification.
|
||||
* Inform the user that the file has been created.
|
||||
* Include instructions on how to run the test to verify failure/success:
|
||||
```bash
|
||||
# Run only the new eval test
|
||||
RUN_EVALS=1 npx vitest run --config evals/vitest.config.ts evals/<descriptive-name>.eval.ts
|
||||
|
||||
# Run all evals (including 'USUALLY_PASSES')
|
||||
npm run test:all_evals
|
||||
|
||||
# Run only 'ALWAYS_PASSES' evals
|
||||
npm run test:always_passing_evals
|
||||
```
|
||||
* Remind them to manually review the assertions and ensure complete anonymization before committing.
|
||||
+41
-1
@@ -32,6 +32,36 @@ CLI's features.
|
||||
|
||||
## Creating an Evaluation
|
||||
|
||||
To easily create a new evaluation, use the `eval-creator` skill with an exported
|
||||
conversation.
|
||||
|
||||
### Using the `eval-creator` skill
|
||||
|
||||
If you have a `chat.json` file exported from a session where the model exhibited
|
||||
incorrect behavior, use Gemini CLI to automatically generate a minimal,
|
||||
anonymized evaluation:
|
||||
|
||||
1. **Export the session:** Export the problematic conversation to a `chat.json`
|
||||
file.
|
||||
2. **Activate the skill:** Run the CLI and ask it to create an eval, activating
|
||||
the skill:
|
||||
|
||||
```bash
|
||||
gemini "Create an eval from my-repro-chat.json" --skill eval-creator
|
||||
```
|
||||
|
||||
Alternatively, start an interactive session, activate the skill with
|
||||
`/skill eval-creator`, and provide the file.
|
||||
|
||||
3. **Follow the prompts:** Answer the CLI prompts to clarify the type of issue
|
||||
(Agent Misbehavior vs. Solution Issue) and confirm which specific
|
||||
interactions to turn into a test.
|
||||
4. **Review the generated test:** Manually review the `.eval.ts` file generated
|
||||
in the `evals/` directory. Check the generated assertions and ensure the
|
||||
test is fully anonymized.
|
||||
|
||||
### Manual creation
|
||||
|
||||
Evaluations are located in the `evals` directory. Each evaluation is a Vitest
|
||||
test file that uses the `evalTest` function from `evals/test-helper.ts`.
|
||||
|
||||
@@ -58,7 +88,17 @@ behaviors.
|
||||
flakiness due to non-deterministic behaviors. These are run nightly and used
|
||||
to track the health of the product from build to build.
|
||||
|
||||
#### `EvalCase` Properties
|
||||
### Best Practices
|
||||
|
||||
- **Initially Failing**: An evaluation is only valid to add if it is **initially
|
||||
failing** (demonstrating the bug or missing behavior). Skip this requirement
|
||||
only if you have clear evidence that the bug has already been fixed and the
|
||||
test is for regression verification.
|
||||
- **Minimization**: Keep the `files` and `prompt` as minimal as possible to
|
||||
reproduce the behavior.
|
||||
- **Anonymization**: Always anonymize code, paths, and identifiers.
|
||||
|
||||
### `EvalCase` Properties
|
||||
|
||||
- `name`: The name of the evaluation case.
|
||||
- `prompt`: The prompt to send to the model.
|
||||
|
||||
+51
-1
@@ -13,6 +13,9 @@ import { TestRig } from '@google/gemini-cli-test-utils';
|
||||
import {
|
||||
createUnauthorizedToolError,
|
||||
parseAgentMarkdown,
|
||||
Storage,
|
||||
getProjectHash,
|
||||
SESSION_FILE_PREFIX,
|
||||
} from '@google/gemini-cli-core';
|
||||
|
||||
export * from '@google/gemini-cli-test-utils';
|
||||
@@ -35,6 +38,18 @@ export * from '@google/gemini-cli-test-utils';
|
||||
// This may take a really long time and is not recommended.
|
||||
export type EvalPolicy = 'ALWAYS_PASSES' | 'USUALLY_PASSES';
|
||||
|
||||
/**
 * Declarative description of one behavioral evaluation passed to `evalTest`.
 */
export interface EvalCase {
  // --- Required ---------------------------------------------------------

  /** The name of the evaluation case. */
  name: string;

  /** The prompt to send to the model. */
  prompt: string;

  /** Assertion callback; receives the test rig and a string result. */
  assert: (rig: TestRig, result: string) => Promise<void>;

  // --- Optional ---------------------------------------------------------

  /**
   * Maps file paths to file contents.
   * NOTE(review): presumably materialized in the test directory before the
   * prompt is issued — confirm against the `evalTest` implementation.
   */
  files?: Record<string, string>;

  /**
   * Prior conversation messages. When provided, `evalTest` writes them to a
   * saved chat session file and resumes that session for the run.
   */
  messages?: any[];

  /**
   * Session id to use for the seeded conversation. Only consulted when
   * `messages` is set; a random `test-session-…` id is generated if omitted.
   */
  sessionId?: string;

  /** Approval mode handed to the rig; `evalTest` falls back to `'yolo'`. */
  approvalMode?: 'default' | 'auto_edit' | 'yolo' | 'plan';

  /** Timeout forwarded to the rig's run call. */
  timeout?: number;

  /** NOTE(review): free-form parameters; usage is not visible here — confirm. */
  params?: Record<string, any>;
}
|
||||
|
||||
export function evalTest(policy: EvalPolicy, evalCase: EvalCase) {
|
||||
const fn = async () => {
|
||||
const rig = new TestRig();
|
||||
@@ -116,8 +131,43 @@ export function evalTest(policy: EvalPolicy, evalCase: EvalCase) {
|
||||
execSync('git commit --allow-empty -m "Initial commit"', execOptions);
|
||||
}
|
||||
|
||||
// Optionally seed a saved chat session from `evalCase.messages` so the run
// can resume it instead of starting from an empty history.
let sessionId: string | undefined;
if (evalCase.messages) {
  sessionId =
    evalCase.sessionId ||
    `test-session-${crypto.randomUUID().slice(0, 8)}`;
  const storage = new Storage(fs.realpathSync(rig.testDir!));

  // We need to set the GEMINI_CLI_HOME env var so Storage.getGlobalGeminiDir() points to our fake home
  const originalGeminiHome = process.env['GEMINI_CLI_HOME'];
  try {
    process.env['GEMINI_CLI_HOME'] = rig.homeDir!;
    await storage.initialize();
    const chatsDir = path.join(storage.getProjectTempDir(), 'chats');
    fs.mkdirSync(chatsDir, { recursive: true });

    // Take one timestamp so startTime, lastUpdated and the filename date
    // always agree, even if the clock ticks over between statements.
    const now = new Date().toISOString();
    const conversation = {
      sessionId,
      projectHash: getProjectHash(fs.realpathSync(rig.testDir!)),
      startTime: now,
      lastUpdated: now,
      messages: evalCase.messages,
    };

    const filename = `${SESSION_FILE_PREFIX}${now.slice(0, 10)}-${sessionId.slice(0, 8)}.json`;
    fs.writeFileSync(
      path.join(chatsDir, filename),
      JSON.stringify(conversation, null, 2),
    );
  } finally {
    // Assigning `undefined` to a process.env key stores the string
    // "undefined", so remove the key when it was not set beforehand.
    if (originalGeminiHome === undefined) {
      delete process.env['GEMINI_CLI_HOME'];
    } else {
      process.env['GEMINI_CLI_HOME'] = originalGeminiHome;
    }
  }
}
|
||||
|
||||
const result = await rig.run({
|
||||
args: evalCase.prompt,
|
||||
args: sessionId
|
||||
? ['--resume', sessionId, evalCase.prompt]
|
||||
: evalCase.prompt,
|
||||
approvalMode: evalCase.approvalMode ?? 'yolo',
|
||||
timeout: evalCase.timeout,
|
||||
env: {
|
||||
|
||||
Reference in New Issue
Block a user