feat(core): experimental in-progress steering hints (2 of 2) (#19307)

2026-06-14 21:37:20 -07:00 · 2026-02-18 14:05:50 -08:00
parent 81c8893e05
commit 87f5dd15d6
37 changed files with 1280 additions and 48 deletions
@@ -0,0 +1,89 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { describe, expect } from 'vitest';
+import { act } from 'react';
+import path from 'node:path';
+import fs from 'node:fs';
+import { appEvalTest } from './app-test-helper.js';
+import { PolicyDecision } from '@google/gemini-cli-core';
+
+describe('Model Steering Behavioral Evals', () => {
+  // Fresh overrides per scenario: steering on, same three tools excluded.
+  const steeringOverrides = () => ({
+    excludeTools: ['run_shell_command', 'ls', 'google_web_search'],
+    modelSteering: true,
+  });
+
+  // Scenario 1: a hint sent while a tool call is paused replaces the task.
+  appEvalTest('ALWAYS_PASSES', {
+    name: 'Corrective Hint: Model switches task based on hint during tool turn',
+    configOverrides: steeringOverrides(),
+    files: {
+      'README.md':
+        '# Gemini CLI\nThis is a tool for developers.\nLicense: Apache-2.0\nLine 4\nLine 5\nLine 6',
+    },
+    prompt: 'Find the first 5 lines of README.md',
+    setup: async (rig) => {
+      rig.setBreakpoint(['read_file', 'list_directory', 'glob']);
+    },
+    assert: async (rig) => {
+      const pausedToolPattern = /read_file|list_directory|glob/i;
+      const jokePattern = /Knock,? knock/i;
+      const correctiveHint =
+        'Actually, stop what you are doing. Just tell me a short knock-knock joke about a robot instead.';
+
+      // Block until one of the breakpointed tools is awaiting confirmation.
+      await rig.waitForPendingConfirmation(pausedToolPattern, 30000);
+
+      // Queue the hint first, then release the paused tool call.
+      await rig.addUserHint(correctiveHint);
+      await rig.resolveAwaitedTool();
+
+      // Wait for the joke and for the session to go idle before inspecting.
+      await rig.waitForOutput(jokePattern, 40000);
+      await rig.waitForIdle(30000);
+
+      const rendered = rig.getStaticOutput();
+      expect(rendered).toMatch(jokePattern);
+      expect(rendered).not.toContain('Line 6');
+    },
+  });
+
+  // Scenario 2: a hint sent mid-run adds a second file to the original task.
+  appEvalTest('ALWAYS_PASSES', {
+    name: 'Suggestive Hint: Model incorporates user guidance mid-stream',
+    configOverrides: steeringOverrides(),
+    files: {},
+    prompt: 'Create a file called "hw.js" with a JS hello world.',
+    setup: async (rig) => {
+      rig.setBreakpoint(['write_file']);
+    },
+    assert: async (rig) => {
+      // First pause: the write_file call triggered by the original prompt.
+      await rig.waitForPendingConfirmation('write_file', 30000);
+      await rig.addUserHint(
+        'Next, create a file called "hw.py" with a python hello world.',
+      );
+      await rig.resolveAwaitedTool();
+
+      // Second pause: another write_file call is expected after the hint.
+      await rig.waitForPendingConfirmation('write_file', 30000);
+      await rig.resolveAwaitedTool();
+      await rig.waitForIdle(60000);
+
+      const workspace = rig.getTestDir();
+      const expectedFiles = [
+        ['hw.js', 'hw.js should exist'],
+        ['hw.py', 'hw.py should exist'],
+      ];
+      for (const [fileName, message] of expectedFiles) {
+        const present = fs.existsSync(path.join(workspace, fileName));
+        expect(present, message).toBe(true);
+      }
+    },
+  });
+});