feat(core): experimental in-progress steering hints (1 of 3) (#19008)

2026-04-24 20:14:44 -07:00 · 2026-02-17 14:59:33 -08:00
parent 5e2f5df62c
commit 55c628e967
20 changed files with 1381 additions and 60 deletions
@@ -0,0 +1,86 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { AppRig } from '../packages/cli/src/test-utils/AppRig.js';
+import {
+  type EvalPolicy,
+  runEval,
+  prepareLogDir,
+  symlinkNodeModules,
+} from './test-helper.js';
+import fs from 'node:fs';
+import path from 'node:path';
+import { DEFAULT_GEMINI_MODEL } from '@google/gemini-cli-core';
+
+export interface AppEvalCase {
+  name: string; // test title; also sanitized into the log file name
+  configOverrides?: any; // spread over the default `{ model: DEFAULT_GEMINI_MODEL }` passed to AppRig
+  prompt: string; // first message sent after the initial rig.waitForIdle() resolves
+  timeout?: number; // appEvalTest registers the test with (timeout ?? 60000) + 10000; presumably milliseconds (Vitest convention)
+  files?: Record<string, string>; // path relative to the rig's test dir -> file content, written before setup
+  setup?: (rig: AppRig) => Promise<void>; // awaited after files are written and before rig.render()
+  assert: (rig: AppRig, output: string) => Promise<void>; // receives rig.getStaticOutput() read right after sendMessage resolves
+}
+
+/**
+ * Registers a behavioral eval that runs against the in-process AppRig: seeds files, runs setup, renders, sends the prompt, then calls assert.
+ * This matches the API of evalTest in test-helper.ts as closely as possible; non-empty static output is saved to <logDir>/<sanitizedName>.log.
+ */
+export function appEvalTest(policy: EvalPolicy, evalCase: AppEvalCase) {
+  const fn = async () => {
+    const rig = new AppRig({
+      configOverrides: {
+        model: DEFAULT_GEMINI_MODEL,
+        ...evalCase.configOverrides, // spread last, so caller overrides win over the default model
+      },
+    });
+
+    const { logDir, sanitizedName } = await prepareLogDir(evalCase.name);
+    const logFile = path.join(logDir, `${sanitizedName}.log`);
+
+    try {
+      await rig.initialize();
+
+      const testDir = rig.getTestDir();
+      symlinkNodeModules(testDir);
+
+      // Write the seed files under the rig's test directory, creating parent directories as needed
+      if (evalCase.files) {
+        for (const [filePath, content] of Object.entries(evalCase.files)) {
+          const fullPath = path.join(testDir, filePath);
+          fs.mkdirSync(path.dirname(fullPath), { recursive: true });
+          fs.writeFileSync(fullPath, content);
+        }
+      }
+
+      // Run custom setup if provided (e.g. for breakpoints); this happens before the first render
+      if (evalCase.setup) {
+        await evalCase.setup(rig);
+      }
+
+      // Render the app
+      rig.render();
+
+      // Wait until the rig reports idle before sending any input
+      await rig.waitForIdle();
+
+      // Send the initial prompt
+      await rig.sendMessage(evalCase.prompt);
+
+      // Run assertion on the static output as read at this point. Interaction-heavy tests can do their own waiting/steering inside assert.
+      const output = rig.getStaticOutput();
+      await evalCase.assert(rig, output);
+    } finally { // runs on success and on failure: save the static output (if non-empty), then unmount
+      const output = rig.getStaticOutput();
+      if (output) {
+        await fs.promises.writeFile(logFile, output);
+      }
+      await rig.unmount(); // NOTE(review): skipped if getStaticOutput()/writeFile above throws
+    }
+  };
+
+  runEval(policy, evalCase.name, fn, (evalCase.timeout ?? 60000) + 10000); // test timeout = case timeout (default 60000) plus 10000 extra
+}
@@ -47,11 +47,7 @@ export function evalTest(policy: EvalPolicy, evalCase: EvalCase) {

      // Symlink node modules to reduce the amount of time needed to
      // bootstrap test projects.
-      const rootNodeModules = path.join(process.cwd(), 'node_modules');
-      const testNodeModules = path.join(rig.testDir || '', 'node_modules');
-      if (fs.existsSync(rootNodeModules) && !fs.existsSync(testNodeModules)) {
-        fs.symlinkSync(rootNodeModules, testNodeModules, 'dir');
-      }
+      symlinkNodeModules(rig.testDir || '');

      if (evalCase.files) {
        const acknowledgedAgents: Record<string, Record<string, string>> = {};
@@ -159,20 +155,47 @@ export function evalTest(policy: EvalPolicy, evalCase: EvalCase) {
    }
  };

+  runEval(policy, evalCase.name, fn, evalCase.timeout);
+}
+
+/**
+ * Registers fn with Vitest: it.skip when policy is 'USUALLY_PASSES' and RUN_EVALS is unset, otherwise it(name, fn, timeout).
+ */
+export function runEval(
+  policy: EvalPolicy,
+  name: string,
+  fn: () => Promise<void>,
+  timeout?: number,
+) {
  if (policy === 'USUALLY_PASSES' && !process.env['RUN_EVALS']) {
-    it.skip(evalCase.name, fn);
+    it.skip(name, fn);
  } else {
-    it(evalCase.name, fn, evalCase.timeout);
+    it(name, fn, timeout);
  }
 }

-async function prepareLogDir(name: string) {
+export async function prepareLogDir(name: string) { // creates <cwd>/evals/logs if missing; returns it plus name lowercased with non-alphanumerics replaced by '_'
  const logDir = path.resolve(process.cwd(), 'evals/logs');
  await fs.promises.mkdir(logDir, { recursive: true });
  const sanitizedName = name.replace(/[^a-z0-9]/gi, '_').toLowerCase();
  return { logDir, sanitizedName };
 }

+/**
+ * Symlinks <cwd>/node_modules into testDir to speed up tests that need to run tools; no-op if testDir is empty, the root node_modules is missing, or the target already exists.
+ */
+export function symlinkNodeModules(testDir: string) {
+  const rootNodeModules = path.join(process.cwd(), 'node_modules');
+  const testNodeModules = path.join(testDir, 'node_modules');
+  if (
+    testDir &&
+    fs.existsSync(rootNodeModules) &&
+    !fs.existsSync(testNodeModules)
+  ) {
+    fs.symlinkSync(rootNodeModules, testNodeModules, 'dir');
+  }
+}
+
 export interface EvalCase {
  name: string;
  params?: Record<string, any>;
@@ -5,8 +5,15 @@
 */

 import { defineConfig } from 'vitest/config';
+import { fileURLToPath } from 'node:url';
+import * as path from 'node:path';
+
+const __dirname = path.dirname(fileURLToPath(import.meta.url));

 export default defineConfig({
+  resolve: {
+    conditions: ['test'],
+  },
  test: {
    testTimeout: 300000, // 5 minutes
    reporters: ['default', 'json'],
@@ -14,5 +21,16 @@ export default defineConfig({
      json: 'evals/logs/report.json',
    },
    include: ['**/*.eval.ts'],
+    environment: 'node',
+    globals: true,
+    alias: {
+      react: path.resolve(__dirname, '../node_modules/react'),
+    },
+    setupFiles: [path.resolve(__dirname, '../packages/cli/test-setup.ts')],
+    server: {
+      deps: {
+        inline: [/@google\/gemini-cli-core/],
+      },
+    },
  },
 });