Mirror of https://github.com/google-gemini/gemini-cli.git, synced 2026-04-30 06:54:15 -07:00.
feat(core): experimental in-progress steering hints
This is a rebase / refactor of https://github.com/google-gemini/gemini-cli/pull/18783.
This commit is contained in:
@@ -0,0 +1,76 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import { it } from 'vitest';
|
||||
import { AppRig } from '../packages/cli/src/test-utils/AppRig.js';
|
||||
import type { EvalPolicy } from './test-helper.js';
|
||||
import fs from 'node:fs';
|
||||
import path from 'node:path';
|
||||
import { DEFAULT_GEMINI_MODEL } from '@google/gemini-cli-core';
|
||||
|
||||
export interface AppEvalCase {
|
||||
name: string;
|
||||
configOverrides?: any;
|
||||
prompt: string;
|
||||
timeout?: number;
|
||||
files?: Record<string, string>;
|
||||
setup?: (rig: AppRig) => Promise<void>;
|
||||
assert: (rig: AppRig, output: string) => Promise<void>;
|
||||
}
|
||||
|
||||
/**
|
||||
* A helper for running behavioral evaluations using the in-process AppRig.
|
||||
* This matches the API of evalTest in test-helper.ts as closely as possible.
|
||||
*/
|
||||
export function appEvalTest(policy: EvalPolicy, evalCase: AppEvalCase) {
|
||||
const fn = async () => {
|
||||
const rig = new AppRig({
|
||||
configOverrides: {
|
||||
model: DEFAULT_GEMINI_MODEL,
|
||||
...evalCase.configOverrides,
|
||||
},
|
||||
});
|
||||
|
||||
try {
|
||||
await rig.initialize();
|
||||
|
||||
// Setup initial files
|
||||
if (evalCase.files) {
|
||||
const testDir = rig.getTestDir();
|
||||
for (const [filePath, content] of Object.entries(evalCase.files)) {
|
||||
const fullPath = path.join(testDir, filePath);
|
||||
fs.mkdirSync(path.dirname(fullPath), { recursive: true });
|
||||
fs.writeFileSync(fullPath, content);
|
||||
}
|
||||
}
|
||||
|
||||
// Run custom setup if provided (e.g. for breakpoints)
|
||||
if (evalCase.setup) {
|
||||
await evalCase.setup(rig);
|
||||
}
|
||||
|
||||
// Render the app!
|
||||
rig.render();
|
||||
|
||||
// Wait for initial ready state
|
||||
await rig.waitForIdle();
|
||||
|
||||
// Send the initial prompt
|
||||
await rig.sendMessage(evalCase.prompt);
|
||||
|
||||
// Run assertion. Interaction-heavy tests can do their own waiting/steering here.
|
||||
await evalCase.assert(rig, rig.getStaticOutput());
|
||||
} finally {
|
||||
await rig.unmount();
|
||||
}
|
||||
};
|
||||
|
||||
if (policy === 'USUALLY_PASSES' && !process.env['RUN_EVALS']) {
|
||||
it.skip(evalCase.name, fn);
|
||||
} else {
|
||||
it(evalCase.name, fn, (evalCase.timeout ?? 60000) + 10000);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,87 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import { describe, expect } from 'vitest';
|
||||
import { act } from 'react';
|
||||
import path from 'node:path';
|
||||
import fs from 'node:fs';
|
||||
import { appEvalTest } from './app-test-helper.js';
|
||||
import { PolicyDecision } from '@google/gemini-cli-core';
|
||||
|
||||
// Behavioral evals for mid-turn "steering hints": user messages injected while
// the model is paused on a breakpointed tool call, which the model is expected
// to incorporate when the turn resumes.
describe('Model Steering Behavioral Evals', () => {
  // Case 1: a corrective hint should make the model abandon the original task
  // entirely and pivot to the newly requested one.
  appEvalTest('ALWAYS_PASSES', {
    name: 'Corrective Hint: Model switches task based on hint during tool turn',
    configOverrides: {
      // Restrict the toolset so the model must reach for read_file /
      // list_directory / glob — the tools the breakpoint below intercepts.
      excludeTools: ['run_shell_command', 'ls', 'google_web_search'],
    },
    files: {
      // Six known lines; the prompt only asks for the first five.
      'README.md':
        '# Gemini CLI\nThis is a tool for developers.\nLicense: Apache-2.0\nLine 4\nLine 5\nLine 6',
    },
    prompt: 'Find the first 5 lines of README.md',
    setup: async (rig) => {
      // Pause on any relevant tool to inject a corrective hint while the
      // tool call is still pending.
      rig.setBreakpoint(['read_file', 'list_directory', 'glob']);
    },
    assert: async (rig) => {
      // Wait for the model to pause on any of the breakpointed tool calls.
      await rig.waitForPendingConfirmation(
        /read_file|list_directory|glob/i,
        30000,
      );

      // Interrupt with a corrective hint that replaces the original task.
      await rig.addUserHint(
        'Actually, stop what you are doing. Just tell me a short knock-knock joke about a robot instead.',
      );

      // Resolve the paused tool to let the turn finish and the model see the hint.
      await rig.resolveAwaitedTool();

      // Verify the model pivots to the new task.
      await rig.waitForOutput(/Knock,? knock/i, 40000);
      await rig.waitForIdle(30000);

      const output = rig.getStaticOutput();
      expect(output).toMatch(/Knock,? knock/i);
      // NOTE(review): 'Line 6' appearing would suggest the README content was
      // echoed into the output — used here as a proxy for the model having
      // carried on with the original file-reading task; confirm this matches
      // how the rig surfaces tool results in static output.
      expect(output).not.toContain('Line 6');
    },
  });

  // Case 2: a suggestive hint should be taken as additional work rather than
  // a replacement — both the original and hinted files must exist at the end.
  appEvalTest('ALWAYS_PASSES', {
    name: 'Suggestive Hint: Model incorporates user guidance mid-stream',
    configOverrides: {
      excludeTools: ['run_shell_command', 'ls', 'google_web_search'],
    },
    files: {},
    prompt: 'Create a file called "hw.js" with a JS hello world.',
    setup: async (rig) => {
      // Pause on write_file so the hint lands while the first write is pending.
      rig.setBreakpoint(['write_file']);
    },
    assert: async (rig) => {
      // Wait for the model to start creating the first file (hw.js).
      await rig.waitForPendingConfirmation('write_file', 30000);

      // Suggest an additional task while the first write is paused.
      await rig.addUserHint(
        'Next, create a file called "hw.py" with a python hello world.',
      );

      // Resolve the first write, then expect a second write_file (for hw.py)
      // prompted by the hint, resolve that too, and let the run settle.
      await rig.resolveAwaitedTool();
      await rig.waitForPendingConfirmation('write_file', 30000);
      await rig.resolveAwaitedTool();
      await rig.waitForIdle(60000);

      // Both the originally requested file and the hinted file must exist.
      const testDir = rig.getTestDir();
      const hwJs = path.join(testDir, 'hw.js');
      const hwPy = path.join(testDir, 'hw.py');

      expect(fs.existsSync(hwJs), 'hw.js should exist').toBe(true);
      expect(fs.existsSync(hwPy), 'hw.py should exist').toBe(true);
    },
  });
});
|
||||
@@ -5,8 +5,15 @@
|
||||
*/
|
||||
|
||||
import { defineConfig } from 'vitest/config';
|
||||
import { fileURLToPath } from 'node:url';
|
||||
import * as path from 'node:path';
|
||||
|
||||
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
||||
|
||||
export default defineConfig({
|
||||
resolve: {
|
||||
conditions: ['test'],
|
||||
},
|
||||
test: {
|
||||
testTimeout: 300000, // 5 minutes
|
||||
reporters: ['default', 'json'],
|
||||
@@ -14,5 +21,16 @@ export default defineConfig({
|
||||
json: 'evals/logs/report.json',
|
||||
},
|
||||
include: ['**/*.eval.ts'],
|
||||
environment: 'node',
|
||||
globals: true,
|
||||
alias: {
|
||||
react: path.resolve(__dirname, '../node_modules/react'),
|
||||
},
|
||||
setupFiles: [path.resolve(__dirname, '../packages/cli/test-setup.ts')],
|
||||
server: {
|
||||
deps: {
|
||||
inline: [/@google\/gemini-cli-core/],
|
||||
},
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user