Mirror of https://github.com/google-gemini/gemini-cli.git, synced 2026-04-30 06:54:15 -07:00.
feat(core): experimental in-progress steering hints
This is a rebase / refactor of https://github.com/google-gemini/gemini-cli/pull/18783.
This commit is contained in:
@@ -0,0 +1,76 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import { it } from 'vitest';
|
||||
import { AppRig } from '../packages/cli/src/test-utils/AppRig.js';
|
||||
import type { EvalPolicy } from './test-helper.js';
|
||||
import fs from 'node:fs';
|
||||
import path from 'node:path';
|
||||
import { DEFAULT_GEMINI_MODEL } from '@google/gemini-cli-core';
|
||||
|
||||
export interface AppEvalCase {
|
||||
name: string;
|
||||
configOverrides?: any;
|
||||
prompt: string;
|
||||
timeout?: number;
|
||||
files?: Record<string, string>;
|
||||
setup?: (rig: AppRig) => Promise<void>;
|
||||
assert: (rig: AppRig, output: string) => Promise<void>;
|
||||
}
|
||||
|
||||
/**
|
||||
* A helper for running behavioral evaluations using the in-process AppRig.
|
||||
* This matches the API of evalTest in test-helper.ts as closely as possible.
|
||||
*/
|
||||
export function appEvalTest(policy: EvalPolicy, evalCase: AppEvalCase) {
|
||||
const fn = async () => {
|
||||
const rig = new AppRig({
|
||||
configOverrides: {
|
||||
model: DEFAULT_GEMINI_MODEL,
|
||||
...evalCase.configOverrides,
|
||||
},
|
||||
});
|
||||
|
||||
try {
|
||||
await rig.initialize();
|
||||
|
||||
// Setup initial files
|
||||
if (evalCase.files) {
|
||||
const testDir = rig.getTestDir();
|
||||
for (const [filePath, content] of Object.entries(evalCase.files)) {
|
||||
const fullPath = path.join(testDir, filePath);
|
||||
fs.mkdirSync(path.dirname(fullPath), { recursive: true });
|
||||
fs.writeFileSync(fullPath, content);
|
||||
}
|
||||
}
|
||||
|
||||
// Run custom setup if provided (e.g. for breakpoints)
|
||||
if (evalCase.setup) {
|
||||
await evalCase.setup(rig);
|
||||
}
|
||||
|
||||
// Render the app!
|
||||
rig.render();
|
||||
|
||||
// Wait for initial ready state
|
||||
await rig.waitForIdle();
|
||||
|
||||
// Send the initial prompt
|
||||
await rig.sendMessage(evalCase.prompt);
|
||||
|
||||
// Run assertion. Interaction-heavy tests can do their own waiting/steering here.
|
||||
await evalCase.assert(rig, rig.getStaticOutput());
|
||||
} finally {
|
||||
await rig.unmount();
|
||||
}
|
||||
};
|
||||
|
||||
if (policy === 'USUALLY_PASSES' && !process.env['RUN_EVALS']) {
|
||||
it.skip(evalCase.name, fn);
|
||||
} else {
|
||||
it(evalCase.name, fn, (evalCase.timeout ?? 60000) + 10000);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,87 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import { describe, expect } from 'vitest';
|
||||
import { act } from 'react';
|
||||
import path from 'node:path';
|
||||
import fs from 'node:fs';
|
||||
import { appEvalTest } from './app-test-helper.js';
|
||||
import { PolicyDecision } from '@google/gemini-cli-core';
|
||||
|
||||
// Behavioral evals for mid-turn "steering hints": user messages injected while
// the model is paused on a breakpointed tool call, which the model is expected
// to incorporate when the turn resumes.
describe('Model Steering Behavioral Evals', () => {
  // Case 1: a corrective hint should make the model abandon the original task
  // entirely and pivot to the newly requested one.
  appEvalTest('ALWAYS_PASSES', {
    name: 'Corrective Hint: Model switches task based on hint during tool turn',
    configOverrides: {
      // Restrict the toolset so the model must reach for read_file /
      // list_directory / glob — the tools the breakpoint below intercepts.
      excludeTools: ['run_shell_command', 'ls', 'google_web_search'],
    },
    files: {
      // Six known lines; the prompt only asks for the first five.
      'README.md':
        '# Gemini CLI\nThis is a tool for developers.\nLicense: Apache-2.0\nLine 4\nLine 5\nLine 6',
    },
    prompt: 'Find the first 5 lines of README.md',
    setup: async (rig) => {
      // Pause on any relevant tool to inject a corrective hint while the
      // tool call is still pending.
      rig.setBreakpoint(['read_file', 'list_directory', 'glob']);
    },
    assert: async (rig) => {
      // Wait for the model to pause on any of the breakpointed tool calls.
      await rig.waitForPendingConfirmation(
        /read_file|list_directory|glob/i,
        30000,
      );

      // Interrupt with a corrective hint that replaces the original task.
      await rig.addUserHint(
        'Actually, stop what you are doing. Just tell me a short knock-knock joke about a robot instead.',
      );

      // Resolve the paused tool to let the turn finish and the model see the hint.
      await rig.resolveAwaitedTool();

      // Verify the model pivots to the new task.
      await rig.waitForOutput(/Knock,? knock/i, 40000);
      await rig.waitForIdle(30000);

      const output = rig.getStaticOutput();
      expect(output).toMatch(/Knock,? knock/i);
      // NOTE(review): 'Line 6' appearing would suggest the README content was
      // echoed into the output — used here as a proxy for the model having
      // carried on with the original file-reading task; confirm this matches
      // how the rig surfaces tool results in static output.
      expect(output).not.toContain('Line 6');
    },
  });

  // Case 2: a suggestive hint should be taken as additional work rather than
  // a replacement — both the original and hinted files must exist at the end.
  appEvalTest('ALWAYS_PASSES', {
    name: 'Suggestive Hint: Model incorporates user guidance mid-stream',
    configOverrides: {
      excludeTools: ['run_shell_command', 'ls', 'google_web_search'],
    },
    files: {},
    prompt: 'Create a file called "hw.js" with a JS hello world.',
    setup: async (rig) => {
      // Pause on write_file so the hint lands while the first write is pending.
      rig.setBreakpoint(['write_file']);
    },
    assert: async (rig) => {
      // Wait for the model to start creating the first file (hw.js).
      await rig.waitForPendingConfirmation('write_file', 30000);

      // Suggest an additional task while the first write is paused.
      await rig.addUserHint(
        'Next, create a file called "hw.py" with a python hello world.',
      );

      // Resolve the first write, then expect a second write_file (for hw.py)
      // prompted by the hint, resolve that too, and let the run settle.
      await rig.resolveAwaitedTool();
      await rig.waitForPendingConfirmation('write_file', 30000);
      await rig.resolveAwaitedTool();
      await rig.waitForIdle(60000);

      // Both the originally requested file and the hinted file must exist.
      const testDir = rig.getTestDir();
      const hwJs = path.join(testDir, 'hw.js');
      const hwPy = path.join(testDir, 'hw.py');

      expect(fs.existsSync(hwJs), 'hw.js should exist').toBe(true);
      expect(fs.existsSync(hwPy), 'hw.py should exist').toBe(true);
    },
  });
});
|
||||
@@ -5,8 +5,15 @@
|
||||
*/
|
||||
|
||||
import { defineConfig } from 'vitest/config';
|
||||
import { fileURLToPath } from 'node:url';
|
||||
import * as path from 'node:path';
|
||||
|
||||
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
||||
|
||||
export default defineConfig({
|
||||
resolve: {
|
||||
conditions: ['test'],
|
||||
},
|
||||
test: {
|
||||
testTimeout: 300000, // 5 minutes
|
||||
reporters: ['default', 'json'],
|
||||
@@ -14,5 +21,16 @@ export default defineConfig({
|
||||
json: 'evals/logs/report.json',
|
||||
},
|
||||
include: ['**/*.eval.ts'],
|
||||
environment: 'node',
|
||||
globals: true,
|
||||
alias: {
|
||||
react: path.resolve(__dirname, '../node_modules/react'),
|
||||
},
|
||||
setupFiles: [path.resolve(__dirname, '../packages/cli/test-setup.ts')],
|
||||
server: {
|
||||
deps: {
|
||||
inline: [/@google\/gemini-cli-core/],
|
||||
},
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user