feat(core): experimental in-progress steering hints

This is a rebase / refactor of:
https://github.com/google-gemini/gemini-cli/pull/18783
This commit is contained in:
Your Name
2026-02-11 21:14:29 +00:00
parent ef02cec2cd
commit 5ed64c7130
45 changed files with 2090 additions and 136 deletions
+76
View File
@@ -0,0 +1,76 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { it } from 'vitest';
import { AppRig } from '../packages/cli/src/test-utils/AppRig.js';
import type { EvalPolicy } from './test-helper.js';
import fs from 'node:fs';
import path from 'node:path';
import { DEFAULT_GEMINI_MODEL } from '@google/gemini-cli-core';
/**
 * Declarative description of one behavioral eval run by {@link appEvalTest}.
 */
export interface AppEvalCase {
  /** Human-readable test name, passed straight to vitest's `it`. */
  name: string;
  /**
   * Extra config merged over the AppRig defaults (spread after
   * `model: DEFAULT_GEMINI_MODEL`, so these keys win).
   * Typed as a loose record rather than `any` so obvious non-object
   * values are rejected at compile time.
   */
  configOverrides?: Record<string, unknown>;
  /** The initial user prompt sent once the rig is idle. */
  prompt: string;
  /** Per-case budget in ms; the harness adds 10s of overhead on top. */
  timeout?: number;
  /** Fixture files to materialize (relative path -> contents) before the run. */
  files?: Record<string, string>;
  /** Optional hook run after files are written, e.g. to install breakpoints. */
  setup?: (rig: AppRig) => Promise<void>;
  /** Assertion body; receives the rig and the static output at prompt time. */
  assert: (rig: AppRig, output: string) => Promise<void>;
}
/**
* A helper for running behavioral evaluations using the in-process AppRig.
* This matches the API of evalTest in test-helper.ts as closely as possible.
*/
/**
 * Registers a behavioral evaluation as a vitest case backed by the
 * in-process AppRig. Mirrors the evalTest API in test-helper.ts as
 * closely as possible.
 *
 * @param policy USUALLY_PASSES cases are skipped unless the RUN_EVALS
 *   environment variable is set; other policies always run.
 * @param evalCase Declarative description of the eval (prompt, fixture
 *   files, setup hook, assertions, timeout).
 */
export function appEvalTest(policy: EvalPolicy, evalCase: AppEvalCase) {
  const runCase = async () => {
    const rig = new AppRig({
      configOverrides: {
        model: DEFAULT_GEMINI_MODEL,
        ...evalCase.configOverrides,
      },
    });
    try {
      await rig.initialize();

      // Materialize fixture files under the rig's temp directory.
      for (const [relPath, content] of Object.entries(evalCase.files ?? {})) {
        const target = path.join(rig.getTestDir(), relPath);
        fs.mkdirSync(path.dirname(target), { recursive: true });
        fs.writeFileSync(target, content);
      }

      // Per-case setup hook, if any (e.g. installing tool breakpoints).
      await evalCase.setup?.(rig);

      // Mount the UI, wait for it to settle, then fire the prompt.
      rig.render();
      await rig.waitForIdle();
      await rig.sendMessage(evalCase.prompt);

      // Hand control to the case's assertion; interaction-heavy cases do
      // their own waiting/steering in here.
      await evalCase.assert(rig, rig.getStaticOutput());
    } finally {
      await rig.unmount();
    }
  };

  // USUALLY_PASSES evals only run when explicitly enabled.
  const skipped = policy === 'USUALLY_PASSES' && !process.env['RUN_EVALS'];
  // Budget: the case's own timeout (default 60s) plus 10s harness overhead.
  const timeoutMs = (evalCase.timeout ?? 60000) + 10000;
  if (skipped) {
    it.skip(evalCase.name, runCase);
  } else {
    it(evalCase.name, runCase, timeoutMs);
  }
}
+87
View File
@@ -0,0 +1,87 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { describe, expect } from 'vitest';
import { act } from 'react';
import path from 'node:path';
import fs from 'node:fs';
import { appEvalTest } from './app-test-helper.js';
import { PolicyDecision } from '@google/gemini-cli-core';
// Behavioral evals for mid-turn "steering hints": user messages injected
// while the model is paused on a breakpointed tool call, verifying the
// model either pivots (corrective hint) or extends its plan (suggestive
// hint) once the turn resumes.
describe('Model Steering Behavioral Evals', () => {
  appEvalTest('ALWAYS_PASSES', {
    name: 'Corrective Hint: Model switches task based on hint during tool turn',
    configOverrides: {
      // Narrow the toolset so the model must reach one of the breakpointed
      // read-style tools rather than shelling out or searching the web.
      excludeTools: ['run_shell_command', 'ls', 'google_web_search'],
    },
    files: {
      // Six lines so "first 5 lines" excludes exactly 'Line 6' — used
      // below as the signal that the original task was abandoned.
      'README.md':
        '# Gemini CLI\nThis is a tool for developers.\nLicense: Apache-2.0\nLine 4\nLine 5\nLine 6',
    },
    prompt: 'Find the first 5 lines of README.md',
    setup: async (rig) => {
      // Pause on any relevant tool to inject a corrective hint
      rig.setBreakpoint(['read_file', 'list_directory', 'glob']);
    },
    assert: async (rig) => {
      // Wait for the model to pause on any tool call
      await rig.waitForPendingConfirmation(
        /read_file|list_directory|glob/i,
        30000,
      );
      // Interrupt with a corrective hint that replaces the original task
      await rig.addUserHint(
        'Actually, stop what you are doing. Just tell me a short knock-knock joke about a robot instead.',
      );
      // Resolve the tool to let the turn finish and the model see the hint
      await rig.resolveAwaitedTool();
      // Verify the model pivots to the new task
      await rig.waitForOutput(/Knock,? knock/i, 40000);
      await rig.waitForIdle(30000);
      const output = rig.getStaticOutput();
      expect(output).toMatch(/Knock,? knock/i);
      // 'Line 6' in the output would mean the model echoed README content
      // beyond the requested range, i.e. it ignored the corrective hint.
      expect(output).not.toContain('Line 6');
    },
  });
  appEvalTest('ALWAYS_PASSES', {
    name: 'Suggestive Hint: Model incorporates user guidance mid-stream',
    configOverrides: {
      // Block shell access so file creation must go through write_file,
      // which is the breakpointed tool below.
      excludeTools: ['run_shell_command', 'ls', 'google_web_search'],
    },
    files: {},
    prompt: 'Create a file called "hw.js" with a JS hello world.',
    setup: async (rig) => {
      // Pause on write_file to inject a suggestive hint
      rig.setBreakpoint(['write_file']);
    },
    assert: async (rig) => {
      // Wait for the model to start creating the first file
      await rig.waitForPendingConfirmation('write_file', 30000);
      // Additive hint: the original task should still complete as well
      await rig.addUserHint(
        'Next, create a file called "hw.py" with a python hello world.',
      );
      // Resolve and wait for the model to complete both tasks
      // (two write_file confirmations are expected: hw.js then hw.py).
      await rig.resolveAwaitedTool();
      await rig.waitForPendingConfirmation('write_file', 30000);
      await rig.resolveAwaitedTool();
      await rig.waitForIdle(60000);
      const testDir = rig.getTestDir();
      const hwJs = path.join(testDir, 'hw.js');
      const hwPy = path.join(testDir, 'hw.py');
      expect(fs.existsSync(hwJs), 'hw.js should exist').toBe(true);
      expect(fs.existsSync(hwPy), 'hw.py should exist').toBe(true);
    },
  });
});
+18
View File
@@ -5,8 +5,15 @@
*/
import { defineConfig } from 'vitest/config';
import { fileURLToPath } from 'node:url';
import * as path from 'node:path';
const __dirname = path.dirname(fileURLToPath(import.meta.url));
export default defineConfig({
resolve: {
conditions: ['test'],
},
test: {
testTimeout: 300000, // 5 minutes
reporters: ['default', 'json'],
@@ -14,5 +21,16 @@ export default defineConfig({
json: 'evals/logs/report.json',
},
include: ['**/*.eval.ts'],
environment: 'node',
globals: true,
alias: {
react: path.resolve(__dirname, '../node_modules/react'),
},
setupFiles: [path.resolve(__dirname, '../packages/cli/test-setup.ts')],
server: {
deps: {
inline: [/@google\/gemini-cli-core/],
},
},
},
});